diff --git a/docs/api.rst b/docs/api.rst index ad6c6c1d8..71bf7414a 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -17,6 +17,7 @@ API :template: class.rst dataset.Dataset + dataset.DatasetSearcher .. _api_meta_ref: diff --git a/examples/01_datasets/01_plot_dataset_io.py b/examples/01_datasets/01_plot_dataset_io.py index f83a214d2..9dcb9c962 100644 --- a/examples/01_datasets/01_plot_dataset_io.py +++ b/examples/01_datasets/01_plot_dataset_io.py @@ -14,7 +14,7 @@ # ----------------------------------------------------------------------------- import os -from nimare.dataset import Dataset +from nimare.dataset import Dataset, DatasetSearcher from nimare.extract import download_nidm_pain from nimare.transforms import ImageTransformer from nimare.utils import get_resource_path @@ -127,8 +127,11 @@ dset.images[["id", "varcope"]].head() ############################################################################### -# Datasets support many search methods +# The DatasetSearcher class can search Datasets # ----------------------------------------------------------------------------- +searcher = DatasetSearcher() + +############################################################################### # There are ``get_[X]`` and ``get_studies_by_[X]`` methods for a range of # possible search criteria. # The ``get_[X]`` methods allow you to search for specific metadata, while the @@ -139,7 +142,7 @@ # by default, and for every requested study if the ``ids`` argument is provided. # If a study does not have the data requested, the returned list will have # ``None`` for that study. -z_images = dset.get_images(imtype="z") +z_images = searcher.get_images(dset, imtype="z") z_images = [str(z) for z in z_images] print("\n".join(z_images)) @@ -148,16 +151,16 @@ # ````````````````````````````````````````````````````````````````````````````` z_transformer = ImageTransformer(target="z") dset = z_transformer.transform(dset) -z_images = dset.get_images(imtype="z") +z_images = searcher.get_images(dset, imtype="z") z_images = [str(z) for z in z_images] print("\n".join(z_images)) ############################################################################### -# Datasets can also search for studies matching criteria +# DatasetSearchers can also search for studies matching criteria # ----------------------------------------------------------------------------- # ``get_studies_by_[X]`` methods return a list of study identifiers matching # the criteria, such as reporting a peak coordinate near a search coordinate. 
-sel_studies = dset.get_studies_by_coordinate(xyz=[[0, 0, 0]], r=20) +sel_studies = searcher.get_studies_by_coordinate(dset, xyz=[[0, 0, 0]], r=20) print("\n".join(sel_studies)) ############################################################################### diff --git a/examples/02_meta-analyses/07_macm.py b/examples/02_meta-analyses/07_macm.py index ee331d363..39f9ac032 100644 --- a/examples/02_meta-analyses/07_macm.py +++ b/examples/02_meta-analyses/07_macm.py @@ -17,7 +17,7 @@ from nilearn import datasets, image, plotting from nimare.correct import FWECorrector -from nimare.dataset import Dataset +from nimare.dataset import Dataset, DatasetSearcher from nimare.meta.cbma.ale import SCALE from nimare.meta.cbma.mkda import MKDAChi2 @@ -44,7 +44,8 @@ ############################################################################### # Select studies with a reported coordinate in the ROI # ----------------------------------------------------------------------------- -roi_ids = dset.get_studies_by_mask(roi_img) +searcher = DatasetSearcher() +roi_ids = searcher.get_studies_by_mask(dset, roi_img) dset_sel = dset.slice(roi_ids) print(f"{len(roi_ids)}/{len(dset.ids)} studies report at least one coordinate in the ROI") diff --git a/examples/04_decoding/01_plot_discrete_decoders.py b/examples/04_decoding/01_plot_discrete_decoders.py index 123eec0d3..890d09a9e 100644 --- a/examples/04_decoding/01_plot_discrete_decoders.py +++ b/examples/04_decoding/01_plot_discrete_decoders.py @@ -17,7 +17,7 @@ import numpy as np from nilearn.plotting import plot_roi -from nimare.dataset import Dataset +from nimare.dataset import Dataset, DatasetSearcher from nimare.decode import discrete from nimare.utils import get_resource_path @@ -40,7 +40,8 @@ plot_roi(mask_img, draw_cross=False) # Get studies with voxels in the mask -ids = dset.get_studies_by_mask(mask_img) +searcher = DatasetSearcher() +ids = searcher.get_studies_by_mask(dset, mask_img) ############################################################################### # diff --git a/nimare/annotate/cogat.py b/nimare/annotate/cogat.py index a6264598a..6c6365a99 100755 --- a/nimare/annotate/cogat.py +++ b/nimare/annotate/cogat.py @@ -9,7 +9,7 @@ from nimare.annotate import utils from nimare.due import due from nimare.extract import download_cognitive_atlas -from nimare.utils import _uk_to_us +from nimare.extract.utils import _uk_to_us LGR = logging.getLogger(__name__) diff --git a/nimare/base.py b/nimare/base.py index b1ed307bb..b6b244a51 100644 --- a/nimare/base.py +++ b/nimare/base.py @@ -260,7 +260,10 @@ def _collect_inputs(self, dataset, drop_invalid=True): ) if self._required_inputs: - data = dataset.get(self._required_inputs, drop_invalid=drop_invalid) + from nimare.dataset import DatasetSearcher + + searcher = DatasetSearcher() + data = searcher.get(dataset, self._required_inputs, drop_invalid=drop_invalid) # Do not overwrite existing inputs_ attribute. # This is necessary for PairwiseCBMAEstimator, which validates two sets of coordinates # in the same object. diff --git a/nimare/dataset.py b/nimare/dataset.py index 2d605df2a..52e74d289 100755 --- a/nimare/dataset.py +++ b/nimare/dataset.py @@ -26,639 +26,657 @@ LGR = logging.getLogger(__name__) -class Dataset(NiMAREBase): - """Storage container for a coordinate- and/or image-based meta-analytic dataset/database. +class DatasetSearcher(NiMAREBase): + """A tool for searching Datasets.""" - .. 
versionchanged:: 0.0.9

+    def get(self, dataset, dict_, drop_invalid=True):
+        """Retrieve files and/or metadata from a Dataset.

-    * [ENH] Add merge method to Dataset class

+        Parameters
+        ----------
+        dataset : :obj:`~nimare.dataset.Dataset`
+            Dataset from which to collect files or metadata.
+        dict_ : :obj:`dict`
+            Dictionary specifying images or metadata to collect.
+            Keys should be variables to be used as keys for results dictionary.
+            Values should be tuples with two values:
+            type (e.g., 'image' or 'metadata') and specific field corresponding
+            to column of type-specific DataFrame (e.g., 'z' or 'sample_sizes').
+        drop_invalid : :obj:`bool`, optional
+            Whether to automatically ignore any studies without the required data or not.
+            Default is True.

-    .. versionchanged:: 0.0.8

+        Returns
+        -------
+        results : :obj:`dict`
+            A dictionary of lists of requested data. Keys correspond to the keys in ``dict_``.

-    * [FIX] Set ``nimare.dataset.Dataset.basepath`` in :func:`update_path` using absolute path.

+        Examples
+        --------
+        >>> searcher = DatasetSearcher()
+        >>> searcher.get(
+        ...     dset, {'z_maps': ('image', 'z'), 'sample_sizes': ('metadata', 'sample_sizes')}
+        ... )
+        >>> searcher.get(dset, {'coordinates': ('coordinates', None)})
+        """
+        results = {}
+        results["id"] = dataset.ids
+        keep_idx = np.arange(len(dataset.ids), dtype=int)
+        for k, vals in dict_.items():
+            if vals[0] == "image":
+                temp = self.get_images(dataset, imtype=vals[1])
+            elif vals[0] == "metadata":
+                temp = self.get_metadata(dataset, field=vals[1])
+            elif vals[0] == "coordinates":
+                # Break DataFrame down into a list of study-specific DataFrames
+                temp = [
+                    dataset.coordinates.loc[dataset.coordinates["id"] == id_]
+                    for id_ in dataset.ids
+                ]
+                # Replace empty DataFrames with Nones
+                temp = [t if t.size else None for t in temp]
+            elif vals[0] == "annotations":
+                # Break DataFrame down into a list of study-specific DataFrames
+                temp = [
+                    dataset.annotations.loc[dataset.annotations["id"] == id_]
+                    for id_ in dataset.ids
+                ]
+                # Replace empty DataFrames with Nones
+                temp = [t if t.size else None for t in temp]
+            else:
+                raise ValueError(f"Input '{vals[0]}' not understood.")

-    Parameters
-    ----------
-    source : :obj:`str` or :obj:`dict`
-        JSON file containing dictionary with database information or the dict()
-        object

+            results[k] = temp
+            temp_keep_idx = np.where([t is not None for t in temp])[0]
+            keep_idx = np.intersect1d(keep_idx, temp_keep_idx)

-    target : :obj:`str`, optional
-        Desired coordinate space for coordinates. Names follow NIDM convention.
-        Default is 'mni152_2mm' (MNI space with 2x2x2 voxels).
-        This parameter has no impact on images.

+        # reduce
+        if drop_invalid and (len(keep_idx) != len(dataset.ids)):
+            LGR.info(f"Retaining {len(keep_idx)}/{len(dataset.ids)} studies")
+        elif len(keep_idx) != len(dataset.ids):
+            raise Exception(
+                f"Only {len(keep_idx)}/{len(dataset.ids)} in Dataset contain the necessary data. "
+                "If you want to analyze the subset of studies with required data, "
+                "set `drop_invalid` to True."
+            )

-    mask : :obj:`str`, :class:`~nibabel.nifti1.Nifti1Image`, \
-        :class:`~nilearn.input_data.NiftiMasker` or similar, or None, optional
-        Mask(er) to use. If None, uses the target space image, with all
-        non-zero voxels included in the mask.

+        for k in results:
+            results[k] = [results[k][i] for i in keep_idx]
+            if dict_.get(k, [None])[0] in ("coordinates", "annotations"):
+                results[k] = pd.concat(results[k])

-    Attributes
-    ----------
-    space : :obj:`str`
-        Standard space. Same as ``target`` parameter. 
+ return results - Notes - ----- - Images loaded into a Dataset are assumed to be in the same space. - If images have different resolutions or affines from the Dataset's masker, - then they will be resampled automatically, at the point where they're used, - by :obj:`Dataset.masker`. - """ + def _generic_column_getter(self, dataset, attr, ids=None, column=None, ignore_columns=None): + """Extract information from DataFrame-based attributes. - _id_cols = ["id", "study_id", "contrast_id"] + Parameters + ---------- + attr : :obj:`str` + The name of the DataFrame-format Dataset attribute to search. + ids : :obj:`list` or None, optional + A list of study IDs within which to extract values. + If None, extract values for all studies in the Dataset. + Default is None. + column : :obj:`str` or None, optional + The column from which to extract values. + If None, a list of all columns with valid values will be returned. + Must be a column within Dataset.[attr]. + ignore_columns : :obj:`list` or None, optional + A list of columns to ignore. Only used if ``column`` is None. - def __init__(self, source, target="mni152_2mm", mask=None): - if isinstance(source, str): - with open(source, "r") as f_obj: - data = json.load(f_obj) - elif isinstance(source, dict): - data = source + Returns + ------- + result : :obj:`list` or :obj:`str` + A list of values or a string, depending on if ids is a list (or None) or a string. + """ + if ignore_columns is None: + ignore_columns = dataset._id_cols else: - raise Exception("`source` needs to be a file path or a dictionary") + ignore_columns += dataset._id_cols - # Datasets are organized by study, then experiment - # To generate unique IDs, we combine study ID with experiment ID - # build list of ids - id_columns = ["id", "study_id", "contrast_id"] - all_ids = [] - for pid in data.keys(): - for expid in data[pid]["contrasts"].keys(): - id_ = f"{pid}-{expid}" - all_ids.append([id_, pid, expid]) - id_df = pd.DataFrame(columns=id_columns, data=all_ids) - id_df = id_df.set_index("id", drop=False) - self._ids = id_df.index.values + df = getattr(dataset, attr) + return_first = False - # Set up Masker - if mask is None: - mask = get_template(target, mask="brain") - self.masker = mask - self.space = target + if isinstance(ids, str) and column is not None: + return_first = True + ids = _listify(ids) - self.annotations = _dict_to_df(id_df, data, key="labels") - self.coordinates = _dict_to_coordinates(data, masker=self.masker, space=self.space) - self.images = _dict_to_df(id_df, data, key="images") - self.metadata = _dict_to_df(id_df, data, key="metadata") - self.texts = _dict_to_df(id_df, data, key="text") - self.basepath = None + available_types = [c for c in df.columns if c not in dataset._id_cols] + if (column is not None) and (column not in available_types): + raise ValueError( + f"{column} not found in {attr}.\nAvailable types: {', '.join(available_types)}" + ) - def __repr__(self): - """Show basic Dataset representation. + if column is not None: + if ids is not None: + result = df[column].loc[df["id"].isin(ids)].tolist() + else: + result = df[column].tolist() + else: + if ids is not None: + result = {v: df[v].loc[df["id"].isin(ids)].tolist() for v in available_types} + result = {k: v for k, v in result.items() if any(v)} + else: + result = {v: df[v].tolist() for v in available_types} + result = list(result.keys()) - It's basically the same as the NiMAREBase representation, but with the number of - experiments in the Dataset represented as well. 
- """ - # Get default parameter values for the object - signature = inspect.signature(self.__init__) - defaults = { - k: v.default - for k, v in signature.parameters.items() - if v.default is not inspect.Parameter.empty - } + if return_first: + return result[0] + else: + return result - # Eliminate any sub-parameters (e.g., parameters for a MetaEstimator's KernelTransformer), - # as well as default values - params = self.get_params() - params = {k: v for k, v in params.items() if "__" not in k} - # Parameter "target" is stored as attribute "space" - # and we want to show it regardless of whether it's the default or not - params["space"] = self.space - params.pop("target") - params = {k: v for k, v in params.items() if defaults.get(k) != v} + def get_labels(self, dataset, ids=None): + """Extract list of labels for which studies in Dataset have annotations. - # Convert to strings - param_strs = [] - for k, v in params.items(): - if isinstance(v, str): - # Wrap string values in single quotes - param_str = f"{k}='{v}'" - else: - # Keep everything else as-is based on its own repr - param_str = f"{k}={v}" - param_strs.append(param_str) + Parameters + ---------- + ids : :obj:`list`, optional + A list of IDs in the Dataset for which to find labels. Default is + None, in which case all labels are returned. - params_str = ", ".join(param_strs) - params_str = f"{len(self.ids)} experiments{', ' if params_str else ''}{params_str}" - rep = f"{self.__class__.__name__}({params_str})" - return rep + Returns + ------- + labels : :obj:`list` + List of labels for which there are annotations in the Dataset. + """ + if not isinstance(ids, list) and ids is not None: + ids = _listify(ids) - @property - def ids(self): - """numpy.ndarray: 1D array of identifiers in Dataset. + result = [c for c in dataset.annotations.columns if c not in dataset._id_cols] + if ids is not None: + temp_annotations = dataset.annotations.loc[dataset.annotations["id"].isin(ids)] + res = temp_annotations[result].any(axis=0) + result = res.loc[res].index.tolist() - The associated setter for this property is private, as ``Dataset.ids`` is immutable. - """ - return self.__ids + return result - @ids.setter - def _ids(self, ids): - ids = np.sort(np.asarray(ids)) - assert isinstance(ids, np.ndarray) and ids.ndim == 1 - self.__ids = ids + def get_texts(self, dataset, ids=None, text_type=None): + """Extract list of texts of a given type for selected IDs. - @property - def masker(self): - """:class:`nilearn.input_data.NiftiMasker` or similar: Masker object. + Parameters + ---------- + ids : :obj:`list`, optional + A list of IDs in the Dataset for which to find texts. Default is + None, in which case all texts of requested type are returned. + text_type : :obj:`str`, optional + Type of text to extract. Corresponds to column name in + Dataset.texts DataFrame. Default is None. - Defines the space and location of the area of interest (e.g., 'brain'). + Returns + ------- + texts : :obj:`list` + List of texts of requested type for selected IDs. """ - return self.__masker + result = self._generic_column_getter(dataset, "texts", ids=ids, column=text_type) + return result - @masker.setter - def masker(self, mask): - mask = get_masker(mask) - if hasattr(self, "masker") and not np.array_equal( - self.masker.mask_img.affine, mask.mask_img.affine - ): - # This message does not have an associated effect, - # since matrix indices are calculated as necessary - LGR.warning("New masker does not match old masker. 
Space is assumed to be the same.") + def get_metadata(self, dataset, ids=None, field=None): + """Get metadata from Dataset. - self.__masker = mask + Parameters + ---------- + ids : :obj:`list`, optional + A list of IDs in the Dataset for which to find metadata. Default is + None, in which case all metadata of requested type are returned. + field : :obj:`str`, optional + Metadata field to extract. Corresponds to column name in + Dataset.metadata DataFrame. Default is None. - @property - def annotations(self): - """:class:`pandas.DataFrame`: Labels describing studies in the dataset. - - Each study/experiment has its own row. - Columns correspond to individual labels (e.g., 'emotion'), and may - be prefixed with a feature group including two underscores - (e.g., 'Neurosynth_TFIDF__emotion'). + Returns + ------- + metadata : :obj:`list` + List of values of requested type for selected IDs. """ - return self.__annotations - - @annotations.setter - def annotations(self, df): - _validate_df(df) - self.__annotations = df.sort_values(by="id") - - @property - def coordinates(self): - """:class:`pandas.DataFrame`: Coordinates in the dataset. + result = self._generic_column_getter(dataset, "metadata", ids=ids, column=field) + return result - .. versionchanged:: 0.0.10 + def get_images(self, dataset, ids=None, imtype=None): + """Get images of a certain type for a subset of studies in the dataset. - The coordinates attribute no longer includes the associated matrix indices - (columns 'i', 'j', and 'k'). These columns are calculated as needed. + Parameters + ---------- + ids : :obj:`list`, optional + A list of IDs in the Dataset for which to find images. Default is + None, in which case all images of requested type are returned. + imtype : :obj:`str`, optional + Type of image to extract. Corresponds to column name in + Dataset.images DataFrame. Default is None. - Each study has one row for each peak. - Columns include ['x', 'y', 'z'] (peak locations in mm) and 'space' (Dataset's space). + Returns + ------- + images : :obj:`list` + List of images of requested type for selected IDs. """ - return self.__coordinates + ignore_columns = ["space"] + ignore_columns += [c for c in dataset.images.columns if c.endswith("__relative")] + result = self._generic_column_getter( + dataset, + "images", + ids=ids, + column=imtype, + ignore_columns=ignore_columns, + ) + return result - @coordinates.setter - def coordinates(self, df): - _validate_df(df) - self.__coordinates = df.sort_values(by="id") + def get_studies_by_label(self, dataset, labels=None, label_threshold=0.001): + """Extract list of studies with a given label. - @property - def images(self): - """:class:`pandas.DataFrame`: Images in the dataset. + .. versionchanged:: 0.0.10 - Each image type has its own column (e.g., 'z') with absolute paths to - files and each study has its own row. - Additionally, relative paths to image files are stored in columns with - the suffix '__relative' (e.g., 'z__relative'). + Fix bug in which all IDs were returned when a label wasn't present in the Dataset. - Warnings - -------- - Images are assumed to be in the same space, although they may have - different resolutions and affines. Images will be resampled as needed - at the point where they are used, via :obj:`Dataset.masker`. - """ - return self.__images + .. versionchanged:: 0.0.9 - @images.setter - def images(self, df): - _validate_df(df) - self.__images = _validate_images_df(df).sort_values(by="id") + Default value for label_threshold changed to 0.001. 
- @property - def metadata(self): - """:class:`pandas.DataFrame`: Metadata describing studies in the dataset. + Parameters + ---------- + labels : :obj:`list`, optional + List of labels to use to search Dataset. If a contrast has all of + the labels above the threshold, it will be returned. + Default is None. + label_threshold : :obj:`float`, optional + Default is 0.5. - Each metadata field has its own column (e.g., 'sample_sizes') and each study - has its own row. + Returns + ------- + found_ids : :obj:`list` + A list of IDs from the Dataset found by the search criteria. """ - return self.__metadata - - @metadata.setter - def metadata(self, df): - _validate_df(df) - self.__metadata = df.sort_values(by="id") + if isinstance(labels, str): + labels = [labels] + elif not isinstance(labels, list): + raise ValueError(f"Argument 'labels' cannot be {type(labels)}") - @property - def texts(self): - """:class:`pandas.DataFrame`: Texts in the dataset. + missing_labels = [label for label in labels if label not in dataset.annotations.columns] + if missing_labels: + raise ValueError(f"Missing label(s): {', '.join(missing_labels)}") - Each text type has its own column (e.g., 'abstract') and each study - has its own row. - """ - return self.__texts + temp_annotations = dataset.annotations[dataset._id_cols + labels] + found_rows = (temp_annotations[labels] >= label_threshold).all(axis=1) + if any(found_rows): + found_ids = temp_annotations.loc[found_rows, "id"].tolist() + else: + found_ids = [] - @texts.setter - def texts(self, df): - _validate_df(df) - self.__texts = df.sort_values(by="id") + return found_ids - def slice(self, ids): - """Create a new dataset with only requested IDs. + def get_studies_by_mask(self, dataset, mask): + """Extract list of studies with at least one coordinate in mask. Parameters ---------- - ids : array_like - List of study IDs to include in new dataset + mask : img_like + Mask across which to search for coordinates. Returns ------- - new_dset : :obj:`~nimare.dataset.Dataset` - Reduced Dataset containing only requested studies. + found_ids : :obj:`list` + A list of IDs from the Dataset with at least one focus in the mask. """ - new_dset = copy.deepcopy(self) - new_dset._ids = ids - for attribute in ("annotations", "coordinates", "images", "metadata", "texts"): - df = getattr(new_dset, attribute) - df = df.loc[df["id"].isin(ids)] - setattr(new_dset, attribute, df) + from scipy.spatial.distance import cdist - return new_dset + mask = load_niimg(mask) - def merge(self, right): - """Merge two Datasets. + dset_mask = dataset.masker.mask_img + if not np.array_equal(dset_mask.affine, mask.affine): + LGR.warning("Mask affine does not match Dataset affine. Assuming same space.") - .. versionadded:: 0.0.9 + dset_ijk = mm2vox(dataset.coordinates[["x", "y", "z"]].values, mask.affine) + mask_ijk = np.vstack(np.where(mask.get_fdata())).T + distances = cdist(mask_ijk, dset_ijk) + distances = np.any(distances == 0, axis=0) + found_ids = list(dataset.coordinates.loc[distances, "id"].unique()) + return found_ids + + def get_studies_by_coordinate(self, dataset, xyz, r=20): + """Extract list of studies with at least one focus within radius of requested coordinates. Parameters ---------- - right : :obj:`~nimare.dataset.Dataset` - Dataset to merge with. + xyz : (X x 3) array_like + List of coordinates against which to find studies. + r : :obj:`float`, optional + Radius (in mm) within which to find studies. Default is 20mm. 
Returns ------- - :obj:`~nimare.dataset.Dataset` - A Dataset of the two merged Datasets. + found_ids : :obj:`list` + A list of IDs from the Dataset with at least one focus within + radius r of requested coordinates. """ - assert isinstance(right, Dataset) - shared_ids = np.intersect1d(self.ids, right.ids) - if shared_ids.size: - raise Exception("Duplicate IDs detected in both datasets.") + from scipy.spatial.distance import cdist - all_ids = np.concatenate((self.ids, right.ids)) - new_dset = copy.deepcopy(self) - new_dset._ids = all_ids + xyz = np.array(xyz) + assert xyz.shape[1] == 3 and xyz.ndim == 2 + distances = cdist(xyz, dataset.coordinates[["x", "y", "z"]].values) + distances = np.any(distances <= r, axis=0) + found_ids = list(dataset.coordinates.loc[distances, "id"].unique()) + return found_ids - for attribute in ("annotations", "coordinates", "images", "metadata", "texts"): - df1 = getattr(self, attribute) - df2 = getattr(right, attribute) - new_df = df1.append(df2, ignore_index=True, sort=False) - new_df.sort_values(by="id", inplace=True) - new_df.reset_index(drop=True, inplace=True) - new_df = new_df.where(~new_df.isna(), None) - setattr(new_dset, attribute, new_df) - new_dset.coordinates = _transform_coordinates_to_space( - new_dset.coordinates, - self.masker, - self.space, - ) +class Dataset(NiMAREBase): + """Storage container for a coordinate- and/or image-based meta-analytic dataset/database. - return new_dset + .. versionchanged:: 0.0.12 - def update_path(self, new_path): - """Update paths to images. + All search methods have been moved out of Dataset and into DatasetSearcher. - Prepends new path to the relative path for files in Dataset.images. + .. versionchanged:: 0.0.9 - Parameters - ---------- - new_path : :obj:`str` - Path to prepend to relative paths of files in Dataset.images. - """ - self.basepath = op.abspath(new_path) - df = self.images - relative_path_cols = [c for c in df if c.endswith("__relative")] - for col in relative_path_cols: - abs_col = col.replace("__relative", "") - if abs_col in df.columns: - LGR.info(f"Overwriting images column {abs_col}") - df[abs_col] = df[col].apply(_try_prepend, prefix=self.basepath) - self.images = df + * [ENH] Add merge method to Dataset class - def copy(self): - """Create a copy of the Dataset.""" - return copy.deepcopy(self) + .. versionchanged:: 0.0.8 - def get(self, dict_, drop_invalid=True): - """Retrieve files and/or metadata from the current Dataset. + * [FIX] Set ``nimare.dataset.Dataset.basepath`` in :func:`update_path` using absolute path. - Parameters - ---------- - dict_ : :obj:`dict` - Dictionary specifying images or metadata to collect. - Keys should be variables to be used as keys for results dictionary. - Values should be tuples with two values: - type (e.g., 'image' or 'metadata') and specific field corresponding - to column of type-specific DataFrame (e.g., 'z' or 'sample_sizes'). - drop_invalid : :obj:`bool`, optional - Whether to automatically ignore any studies without the required data or not. - Default is False. + Parameters + ---------- + source : :obj:`str` or :obj:`dict` + JSON file containing dictionary with database information or the dict() + object - Returns - ------- - results : :obj:`dict` - A dictionary of lists of requested data. Keys correspond to the keys in ``dict_``. + target : :obj:`str`, optional + Desired coordinate space for coordinates. Names follow NIDM convention. + Default is 'mni152_2mm' (MNI space with 2x2x2 voxels). + This parameter has no impact on images. 
- Examples - -------- - >>> dset.get({'z_maps': ('image', 'z'), 'sample_sizes': ('metadata', 'sample_sizes')}) - >>> dset.get({'coordinates': ('coordinates', None)}) - """ - results = {} - results["id"] = self.ids - keep_idx = np.arange(len(self.ids), dtype=int) - for k, vals in dict_.items(): - if vals[0] == "image": - temp = self.get_images(imtype=vals[1]) - elif vals[0] == "metadata": - temp = self.get_metadata(field=vals[1]) - elif vals[0] == "coordinates": - # Break DataFrame down into a list of study-specific DataFrames - temp = [self.coordinates.loc[self.coordinates["id"] == id_] for id_ in self.ids] - # Replace empty DataFrames with Nones - temp = [t if t.size else None for t in temp] - elif vals[0] == "annotations": - # Break DataFrame down into a list of study-specific DataFrames - temp = [self.annotations.loc[self.annotations["id"] == id_] for id_ in self.ids] - # Replace empty DataFrames with Nones - temp = [t if t.size else None for t in temp] - else: - raise ValueError(f"Input '{vals[0]}' not understood.") + mask : :obj:`str`, :class:`~nibabel.nifti1.Nifti1Image`, \ + :class:`~nilearn.input_data.NiftiMasker` or similar, or None, optional + Mask(er) to use. If None, uses the target space image, with all + non-zero voxels included in the mask. - results[k] = temp - temp_keep_idx = np.where([t is not None for t in temp])[0] - keep_idx = np.intersect1d(keep_idx, temp_keep_idx) + Attributes + ---------- + space : :obj:`str` + Standard space. Same as ``target`` parameter. - # reduce - if drop_invalid and (len(keep_idx) != len(self.ids)): - LGR.info(f"Retaining {len(keep_idx)}/{len(self.ids)} studies") - elif len(keep_idx) != len(self.ids): - raise Exception( - f"Only {len(keep_idx)}/{len(self.ids)} in Dataset contain the necessary data. " - "If you want to analyze the subset of studies with required data, " - "set `drop_invalid` to True." - ) + Notes + ----- + Images loaded into a Dataset are assumed to be in the same space. + If images have different resolutions or affines from the Dataset's masker, + then they will be resampled automatically, at the point where they're used, + by :obj:`Dataset.masker`. + """ - for k in results: - results[k] = [results[k][i] for i in keep_idx] - if dict_.get(k, [None])[0] in ("coordinates", "annotations"): - results[k] = pd.concat(results[k]) + _id_cols = ["id", "study_id", "contrast_id"] - return results + def __init__(self, source, target="mni152_2mm", mask=None): + if isinstance(source, str): + with open(source, "r") as f_obj: + data = json.load(f_obj) + elif isinstance(source, dict): + data = source + else: + raise Exception("`source` needs to be a file path or a dictionary") - def _generic_column_getter(self, attr, ids=None, column=None, ignore_columns=None): - """Extract information from DataFrame-based attributes. + # Datasets are organized by study, then experiment + # To generate unique IDs, we combine study ID with experiment ID + # build list of ids + id_columns = ["id", "study_id", "contrast_id"] + all_ids = [] + for pid in data.keys(): + for expid in data[pid]["contrasts"].keys(): + id_ = f"{pid}-{expid}" + all_ids.append([id_, pid, expid]) + id_df = pd.DataFrame(columns=id_columns, data=all_ids) + id_df = id_df.set_index("id", drop=False) + self._ids = id_df.index.values - Parameters - ---------- - attr : :obj:`str` - The name of the DataFrame-format Dataset attribute to search. - ids : :obj:`list` or None, optional - A list of study IDs within which to extract values. - If None, extract values for all studies in the Dataset. 
- Default is None. - column : :obj:`str` or None, optional - The column from which to extract values. - If None, a list of all columns with valid values will be returned. - Must be a column within Dataset.[attr]. - ignore_columns : :obj:`list` or None, optional - A list of columns to ignore. Only used if ``column`` is None. + # Set up Masker + if mask is None: + mask = get_template(target, mask="brain") + self.masker = mask + self.space = target - Returns - ------- - result : :obj:`list` or :obj:`str` - A list of values or a string, depending on if ids is a list (or None) or a string. - """ - if ignore_columns is None: - ignore_columns = self._id_cols - else: - ignore_columns += self._id_cols + self.annotations = _dict_to_df(id_df, data, key="labels") + self.coordinates = _dict_to_coordinates(data, masker=self.masker, space=self.space) + self.images = _dict_to_df(id_df, data, key="images") + self.metadata = _dict_to_df(id_df, data, key="metadata") + self.texts = _dict_to_df(id_df, data, key="text") + self.basepath = None - df = getattr(self, attr) - return_first = False + def __repr__(self): + """Show basic Dataset representation. - if isinstance(ids, str) and column is not None: - return_first = True - ids = _listify(ids) + It's basically the same as the NiMAREBase representation, but with the number of + experiments in the Dataset represented as well. + """ + # Get default parameter values for the object + signature = inspect.signature(self.__init__) + defaults = { + k: v.default + for k, v in signature.parameters.items() + if v.default is not inspect.Parameter.empty + } - available_types = [c for c in df.columns if c not in self._id_cols] - if (column is not None) and (column not in available_types): - raise ValueError( - f"{column} not found in {attr}.\nAvailable types: {', '.join(available_types)}" - ) + # Eliminate any sub-parameters (e.g., parameters for a MetaEstimator's KernelTransformer), + # as well as default values + params = self.get_params() + params = {k: v for k, v in params.items() if "__" not in k} + # Parameter "target" is stored as attribute "space" + # and we want to show it regardless of whether it's the default or not + params["space"] = self.space + params.pop("target") + params = {k: v for k, v in params.items() if defaults.get(k) != v} - if column is not None: - if ids is not None: - result = df[column].loc[df["id"].isin(ids)].tolist() - else: - result = df[column].tolist() - else: - if ids is not None: - result = {v: df[v].loc[df["id"].isin(ids)].tolist() for v in available_types} - result = {k: v for k, v in result.items() if any(v)} + # Convert to strings + param_strs = [] + for k, v in params.items(): + if isinstance(v, str): + # Wrap string values in single quotes + param_str = f"{k}='{v}'" else: - result = {v: df[v].tolist() for v in available_types} - result = list(result.keys()) + # Keep everything else as-is based on its own repr + param_str = f"{k}={v}" + param_strs.append(param_str) - if return_first: - return result[0] - else: - return result + params_str = ", ".join(param_strs) + params_str = f"{len(self.ids)} experiments{', ' if params_str else ''}{params_str}" + rep = f"{self.__class__.__name__}({params_str})" + return rep + + @property + def ids(self): + """numpy.ndarray: 1D array of identifiers in Dataset. + + The associated setter for this property is private, as ``Dataset.ids`` is immutable. 
+ """ + return self.__ids + + @ids.setter + def _ids(self, ids): + ids = np.sort(np.asarray(ids)) + assert isinstance(ids, np.ndarray) and ids.ndim == 1 + self.__ids = ids + + @property + def masker(self): + """:class:`nilearn.input_data.NiftiMasker` or similar: Masker object. + + Defines the space and location of the area of interest (e.g., 'brain'). + """ + return self.__masker - def get_labels(self, ids=None): - """Extract list of labels for which studies in Dataset have annotations. + @masker.setter + def masker(self, mask): + mask = get_masker(mask) + if hasattr(self, "masker") and not np.array_equal( + self.masker.mask_img.affine, mask.mask_img.affine + ): + # This message does not have an associated effect, + # since matrix indices are calculated as necessary + LGR.warning("New masker does not match old masker. Space is assumed to be the same.") - Parameters - ---------- - ids : :obj:`list`, optional - A list of IDs in the Dataset for which to find labels. Default is - None, in which case all labels are returned. + self.__masker = mask - Returns - ------- - labels : :obj:`list` - List of labels for which there are annotations in the Dataset. + @property + def annotations(self): + """:class:`pandas.DataFrame`: Labels describing studies in the dataset. + + Each study/experiment has its own row. + Columns correspond to individual labels (e.g., 'emotion'), and may + be prefixed with a feature group including two underscores + (e.g., 'Neurosynth_TFIDF__emotion'). """ - if not isinstance(ids, list) and ids is not None: - ids = _listify(ids) + return self.__annotations - result = [c for c in self.annotations.columns if c not in self._id_cols] - if ids is not None: - temp_annotations = self.annotations.loc[self.annotations["id"].isin(ids)] - res = temp_annotations[result].any(axis=0) - result = res.loc[res].index.tolist() + @annotations.setter + def annotations(self, df): + _validate_df(df) + self.__annotations = df.sort_values(by="id") - return result + @property + def coordinates(self): + """:class:`pandas.DataFrame`: Coordinates in the dataset. - def get_texts(self, ids=None, text_type=None): - """Extract list of texts of a given type for selected IDs. + .. versionchanged:: 0.0.10 - Parameters - ---------- - ids : :obj:`list`, optional - A list of IDs in the Dataset for which to find texts. Default is - None, in which case all texts of requested type are returned. - text_type : :obj:`str`, optional - Type of text to extract. Corresponds to column name in - Dataset.texts DataFrame. Default is None. + The coordinates attribute no longer includes the associated matrix indices + (columns 'i', 'j', and 'k'). These columns are calculated as needed. - Returns - ------- - texts : :obj:`list` - List of texts of requested type for selected IDs. + Each study has one row for each peak. + Columns include ['x', 'y', 'z'] (peak locations in mm) and 'space' (Dataset's space). """ - result = self._generic_column_getter("texts", ids=ids, column=text_type) - return result + return self.__coordinates - def get_metadata(self, ids=None, field=None): - """Get metadata from Dataset. + @coordinates.setter + def coordinates(self, df): + _validate_df(df) + self.__coordinates = df.sort_values(by="id") - Parameters - ---------- - ids : :obj:`list`, optional - A list of IDs in the Dataset for which to find metadata. Default is - None, in which case all metadata of requested type are returned. - field : :obj:`str`, optional - Metadata field to extract. Corresponds to column name in - Dataset.metadata DataFrame. 
Default is None. + @property + def images(self): + """:class:`pandas.DataFrame`: Images in the dataset. - Returns - ------- - metadata : :obj:`list` - List of values of requested type for selected IDs. + Each image type has its own column (e.g., 'z') with absolute paths to + files and each study has its own row. + Additionally, relative paths to image files are stored in columns with + the suffix '__relative' (e.g., 'z__relative'). + + Warnings + -------- + Images are assumed to be in the same space, although they may have + different resolutions and affines. Images will be resampled as needed + at the point where they are used, via :obj:`Dataset.masker`. """ - result = self._generic_column_getter("metadata", ids=ids, column=field) - return result + return self.__images - def get_images(self, ids=None, imtype=None): - """Get images of a certain type for a subset of studies in the dataset. + @images.setter + def images(self, df): + _validate_df(df) + self.__images = _validate_images_df(df).sort_values(by="id") - Parameters - ---------- - ids : :obj:`list`, optional - A list of IDs in the Dataset for which to find images. Default is - None, in which case all images of requested type are returned. - imtype : :obj:`str`, optional - Type of image to extract. Corresponds to column name in - Dataset.images DataFrame. Default is None. + @property + def metadata(self): + """:class:`pandas.DataFrame`: Metadata describing studies in the dataset. - Returns - ------- - images : :obj:`list` - List of images of requested type for selected IDs. + Each metadata field has its own column (e.g., 'sample_sizes') and each study + has its own row. """ - ignore_columns = ["space"] - ignore_columns += [c for c in self.images.columns if c.endswith("__relative")] - result = self._generic_column_getter( - "images", - ids=ids, - column=imtype, - ignore_columns=ignore_columns, - ) - return result + return self.__metadata - def get_studies_by_label(self, labels=None, label_threshold=0.001): - """Extract list of studies with a given label. + @metadata.setter + def metadata(self, df): + _validate_df(df) + self.__metadata = df.sort_values(by="id") - .. versionchanged:: 0.0.10 + @property + def texts(self): + """:class:`pandas.DataFrame`: Texts in the dataset. - Fix bug in which all IDs were returned when a label wasn't present in the Dataset. + Each text type has its own column (e.g., 'abstract') and each study + has its own row. + """ + return self.__texts - .. versionchanged:: 0.0.9 + @texts.setter + def texts(self, df): + _validate_df(df) + self.__texts = df.sort_values(by="id") - Default value for label_threshold changed to 0.001. + def slice(self, ids): + """Create a new dataset with only requested IDs. Parameters ---------- - labels : :obj:`list`, optional - List of labels to use to search Dataset. If a contrast has all of - the labels above the threshold, it will be returned. - Default is None. - label_threshold : :obj:`float`, optional - Default is 0.5. + ids : array_like + List of study IDs to include in new dataset Returns ------- - found_ids : :obj:`list` - A list of IDs from the Dataset found by the search criteria. + new_dset : :obj:`~nimare.dataset.Dataset` + Reduced Dataset containing only requested studies. 
""" - if isinstance(labels, str): - labels = [labels] - elif not isinstance(labels, list): - raise ValueError(f"Argument 'labels' cannot be {type(labels)}") - - missing_labels = [label for label in labels if label not in self.annotations.columns] - if missing_labels: - raise ValueError(f"Missing label(s): {', '.join(missing_labels)}") + new_dset = copy.deepcopy(self) + new_dset._ids = ids + for attribute in ("annotations", "coordinates", "images", "metadata", "texts"): + df = getattr(new_dset, attribute) + df = df.loc[df["id"].isin(ids)] + setattr(new_dset, attribute, df) - temp_annotations = self.annotations[self._id_cols + labels] - found_rows = (temp_annotations[labels] >= label_threshold).all(axis=1) - if any(found_rows): - found_ids = temp_annotations.loc[found_rows, "id"].tolist() - else: - found_ids = [] + return new_dset - return found_ids + def merge(self, right): + """Merge two Datasets. - def get_studies_by_mask(self, mask): - """Extract list of studies with at least one coordinate in mask. + .. versionadded:: 0.0.9 Parameters ---------- - mask : img_like - Mask across which to search for coordinates. + right : :obj:`~nimare.dataset.Dataset` + Dataset to merge with. Returns ------- - found_ids : :obj:`list` - A list of IDs from the Dataset with at least one focus in the mask. + :obj:`~nimare.dataset.Dataset` + A Dataset of the two merged Datasets. """ - from scipy.spatial.distance import cdist + assert isinstance(right, Dataset) + shared_ids = np.intersect1d(self.ids, right.ids) + if shared_ids.size: + raise Exception("Duplicate IDs detected in both datasets.") - mask = load_niimg(mask) + all_ids = np.concatenate((self.ids, right.ids)) + new_dset = copy.deepcopy(self) + new_dset._ids = all_ids - dset_mask = self.masker.mask_img - if not np.array_equal(dset_mask.affine, mask.affine): - LGR.warning("Mask affine does not match Dataset affine. Assuming same space.") + for attribute in ("annotations", "coordinates", "images", "metadata", "texts"): + df1 = getattr(self, attribute) + df2 = getattr(right, attribute) + new_df = df1.append(df2, ignore_index=True, sort=False) + new_df.sort_values(by="id", inplace=True) + new_df.reset_index(drop=True, inplace=True) + new_df = new_df.where(~new_df.isna(), None) + setattr(new_dset, attribute, new_df) - dset_ijk = mm2vox(self.coordinates[["x", "y", "z"]].values, mask.affine) - mask_ijk = np.vstack(np.where(mask.get_fdata())).T - distances = cdist(mask_ijk, dset_ijk) - distances = np.any(distances == 0, axis=0) - found_ids = list(self.coordinates.loc[distances, "id"].unique()) - return found_ids + new_dset.coordinates = _transform_coordinates_to_space( + new_dset.coordinates, + self.masker, + self.space, + ) - def get_studies_by_coordinate(self, xyz, r=20): - """Extract list of studies with at least one focus within radius of requested coordinates. + return new_dset + + def update_path(self, new_path): + """Update paths to images. + + Prepends new path to the relative path for files in Dataset.images. Parameters ---------- - xyz : (X x 3) array_like - List of coordinates against which to find studies. - r : :obj:`float`, optional - Radius (in mm) within which to find studies. Default is 20mm. - - Returns - ------- - found_ids : :obj:`list` - A list of IDs from the Dataset with at least one focus within - radius r of requested coordinates. + new_path : :obj:`str` + Path to prepend to relative paths of files in Dataset.images. 
""" - from scipy.spatial.distance import cdist + self.basepath = op.abspath(new_path) + df = self.images + relative_path_cols = [c for c in df if c.endswith("__relative")] + for col in relative_path_cols: + abs_col = col.replace("__relative", "") + if abs_col in df.columns: + LGR.info(f"Overwriting images column {abs_col}") + df[abs_col] = df[col].apply(_try_prepend, prefix=self.basepath) + self.images = df - xyz = np.array(xyz) - assert xyz.shape[1] == 3 and xyz.ndim == 2 - distances = cdist(xyz, self.coordinates[["x", "y", "z"]].values) - distances = np.any(distances <= r, axis=0) - found_ids = list(self.coordinates.loc[distances, "id"].unique()) - return found_ids + def copy(self): + """Create a copy of the Dataset.""" + return copy.deepcopy(self) diff --git a/nimare/decode/base.py b/nimare/decode/base.py index 7c401ff38..4f2ada899 100644 --- a/nimare/decode/base.py +++ b/nimare/decode/base.py @@ -3,6 +3,7 @@ from abc import abstractmethod from nimare.base import NiMAREBase +from nimare.dataset import DatasetSearcher LGR = logging.getLogger(__name__) @@ -28,7 +29,8 @@ def _collect_inputs(self, dataset, drop_invalid=True): ) if self._required_inputs: - data = dataset.get(self._required_inputs, drop_invalid=drop_invalid) + searcher = DatasetSearcher() + data = searcher.get(dataset, self._required_inputs, drop_invalid=drop_invalid) # Do not overwrite existing inputs_ attribute. # This is necessary for PairwiseCBMAEstimator, which validates two sets of coordinates # in the same object. diff --git a/nimare/decode/continuous.py b/nimare/decode/continuous.py index 11fa7861e..8861a1766 100755 --- a/nimare/decode/continuous.py +++ b/nimare/decode/continuous.py @@ -9,6 +9,7 @@ from tqdm.auto import tqdm from nimare import references +from nimare.dataset import DatasetSearcher from nimare.decode.base import Decoder from nimare.decode.utils import weight_priors from nimare.due import due @@ -181,10 +182,12 @@ def _fit(self, dataset): Masked meta-analytic maps """ self.masker = dataset.masker + searcher = DatasetSearcher() n_features = len(self.features_) for i_feature, feature in enumerate(tqdm(self.features_, total=n_features)): - feature_ids = dataset.get_studies_by_label( + feature_ids = searcher.get_studies_by_label( + dataset, labels=[feature], label_threshold=self.frequency_threshold, ) @@ -289,11 +292,12 @@ def _fit(self, dataset): Masked meta-analytic maps """ self.masker = dataset.masker + searcher = DatasetSearcher() images_ = {} for feature in self.features_: - feature_ids = dataset.get_studies_by_label( - labels=[feature], label_threshold=self.frequency_threshold + feature_ids = searcher.get_studies_by_label( + dataset, labels=[feature], label_threshold=self.frequency_threshold ) selected_ids = sorted(list(set(feature_ids).intersection(self.inputs_["id"]))) selected_id_idx = [ diff --git a/nimare/extract/utils.py b/nimare/extract/utils.py index 710dccafe..5341fa6be 100644 --- a/nimare/extract/utils.py +++ b/nimare/extract/utils.py @@ -4,13 +4,14 @@ import logging import os import os.path as op +import re import numpy as np import pandas as pd import requests from fuzzywuzzy import fuzz -from nimare.utils import _uk_to_us +from nimare.utils import get_resource_path LGR = logging.getLogger(__name__) @@ -292,3 +293,30 @@ def _expand_df(df): df["ratio"] = df[["alias", "name"]].apply(_get_ratio, axis=1) df = df.sort_values(by=["length", "ratio"], ascending=[False, False]) return df + + +def _uk_to_us(text): + """Convert UK spellings to US based on a converter. + + .. 
versionadded:: 0.0.2 + + Parameters + ---------- + text : :obj:`str` + + Returns + ------- + text : :obj:`str` + + Notes + ----- + The english_spellings.csv file is from http://www.tysto.com/uk-us-spelling-list.html. + """ + SPELL_DF = pd.read_csv(op.join(get_resource_path(), "english_spellings.csv"), index_col="UK") + SPELL_DICT = SPELL_DF["US"].to_dict() + + if isinstance(text, str): + # Convert British to American English + pattern = re.compile(r"\b(" + "|".join(SPELL_DICT.keys()) + r")\b") + text = pattern.sub(lambda x: SPELL_DICT[x.group()], text) + return text diff --git a/nimare/meta/cbma/base.py b/nimare/meta/cbma/base.py index 81a7229f8..ec4c0142d 100644 --- a/nimare/meta/cbma/base.py +++ b/nimare/meta/cbma/base.py @@ -11,13 +11,17 @@ from tqdm.auto import tqdm from nimare.base import Estimator +from nimare.dataset import DatasetSearcher from nimare.meta.kernel import KernelTransformer -from nimare.meta.utils import _calculate_cluster_measures, _get_last_bin +from nimare.meta.utils import ( + _add_metadata_to_dataframe, + _calculate_cluster_measures, + _get_last_bin, +) from nimare.results import MetaResult from nimare.stats import null_to_p, nullhist_to_p from nimare.transforms import p_to_z from nimare.utils import ( - _add_metadata_to_dataframe, _check_ncores, _check_type, get_masker, @@ -100,6 +104,7 @@ def _preprocess_input(self, dataset): (2) IJK coordinates will be added based on the mask image's affine, and (3) sample sizes may be added to the "coordinates" key, as needed. """ + searcher = DatasetSearcher() masker = self.masker or dataset.masker mask_img = masker.mask_img or masker.labels_img @@ -112,7 +117,8 @@ def _preprocess_input(self, dataset): if hasattr(self, "kernel_transformer"): self.kernel_transformer._infer_names(affine=md5(mask_img.affine).hexdigest()) if self.kernel_transformer.image_type in dataset.images.columns: - files = dataset.get_images( + files = searcher.get_images( + dataset, ids=self.inputs_["id"], imtype=self.kernel_transformer.image_type, ) diff --git a/nimare/meta/kernel.py b/nimare/meta/kernel.py index 53e24de55..3b01b7d03 100644 --- a/nimare/meta/kernel.py +++ b/nimare/meta/kernel.py @@ -19,14 +19,16 @@ from nimare import references from nimare.base import NiMAREBase +from nimare.dataset import DatasetSearcher from nimare.due import due from nimare.meta.utils import ( + _add_metadata_to_dataframe, compute_ale_ma, compute_kda_ma, compute_p2m_ma, get_ale_kernel, ) -from nimare.utils import _add_metadata_to_dataframe, _safe_transform, mm2vox, vox2mm +from nimare.utils import _safe_transform, mm2vox, vox2mm LGR = logging.getLogger(__name__) @@ -134,6 +136,7 @@ def transform(self, dataset, masker=None, return_type="image"): # but has different affine, from original IJK. coordinates[["i", "j", "k"]] = mm2vox(coordinates[["x", "y", "z"]], mask.affine) else: + searcher = DatasetSearcher() masker = dataset.masker if not masker else masker mask = masker.mask_img coordinates = dataset.coordinates.copy() @@ -145,7 +148,11 @@ def transform(self, dataset, masker=None, return_type="image"): # Use coordinates to get IDs instead of Dataset.ids bc of possible # mismatch between full Dataset and contrasts with coordinates. if self.image_type in dataset.images.columns: - files = dataset.get_images(ids=coordinates["id"].unique(), imtype=self.image_type) + files = searcher.get_images( + dataset, + ids=coordinates["id"].unique(), + imtype=self.image_type, + ) if all(f is not None for f in files): LGR.debug("Files already exist. 
Using them.") if return_type == "array": diff --git a/nimare/meta/utils.py b/nimare/meta/utils.py index e2d24d25d..e56819385 100755 --- a/nimare/meta/utils.py +++ b/nimare/meta/utils.py @@ -5,10 +5,12 @@ import nibabel as nib import numpy as np import numpy.linalg as npl +import pandas as pd import sparse from scipy import ndimage from nimare import references +from nimare.dataset import DatasetSearcher from nimare.due import due from nimare.extract import download_peaks2maps_model @@ -16,6 +18,75 @@ LGR = logging.getLogger(__name__) +def _add_metadata_to_dataframe( + dataset, + dataframe, + metadata_field, + target_column, + filter_func=np.mean, +): + """Add metadata from a Dataset to a DataFrame. + + .. versionadded:: 0.0.8 + + This is particularly useful for kernel transformers or estimators where a given metadata field + is necessary (e.g., ALEKernel with "sample_size"), but we want to just use the coordinates + DataFrame instead of passing the full Dataset. + + Parameters + ---------- + dataset : :obj:`~nimare.dataset.Dataset` + Dataset containing study IDs and metadata to feed into dataframe. + dataframe : :obj:`pandas.DataFrame` + DataFrame containing study IDs, into which Dataset metadata will be merged. + metadata_field : :obj:`str` + Metadata field in ``dataset``. + target_column : :obj:`str` + Name of the column that will be added to ``dataframe``, containing information from the + Dataset. + filter_func : :obj:`function`, optional + Function to apply to the metadata so that it fits as a column in a DataFrame. + Default is ``numpy.mean``. + + Returns + ------- + dataframe : :obj:`pandas.DataFrame` + Updated DataFrame with ``target_column`` added. + """ + dataframe = dataframe.copy() + searcher = DatasetSearcher() + + if metadata_field in searcher.get_metadata(dataset): + # Collect metadata from Dataset + metadata = searcher.get_metadata(dataset, field=metadata_field, ids=dataset.ids) + metadata = [[m] for m in metadata] + # Create a DataFrame with the metadata + metadata = pd.DataFrame( + index=dataset.ids, + data=metadata, + columns=[metadata_field], + ) + # Reduce the metadata (if in list/array format) to single values + metadata[target_column] = metadata[metadata_field].apply(filter_func) + # Merge metadata df into coordinates df + dataframe = dataframe.merge( + right=metadata, + left_on="id", + right_index=True, + sort=False, + validate="many_to_one", + suffixes=(False, False), + how="left", + ) + else: + LGR.warning( + f"Metadata field '{metadata_field}' not found. " + "Set a constant value for this field as an argument, if possible." + ) + + return dataframe + + def model_fn(features, labels, mode, params): """Run model function used internally by peaks2maps. 
diff --git a/nimare/tests/test_dataset.py b/nimare/tests/test_dataset.py index 65bc26238..b102dd659 100644 --- a/nimare/tests/test_dataset.py +++ b/nimare/tests/test_dataset.py @@ -10,6 +10,34 @@ from nimare.tests.utils import get_test_data_path +def test_DatasetSearcher(testdata_laird): + """Test the DatasetSearcher class.""" + dset = testdata_laird.copy() + searcher = dataset.DatasetSearcher() + METHODS = [searcher.get_images, searcher.get_labels, searcher.get_metadata, searcher.get_texts] + for method in METHODS: + assert isinstance(method(dset), list) + assert isinstance(method(dset, ids=dset.ids[:5]), list) + assert isinstance(method(dset, ids=dset.ids[0]), list) + + # This test dataset has no images + with pytest.raises(ValueError): + searcher.get_images(dset, imtype="beta") + + assert isinstance(searcher.get_metadata(dset, field="journal"), list) + assert isinstance(searcher.get_studies_by_label(dset, "Neurosynth_TFIDF__analyze"), list) + assert isinstance(searcher.get_studies_by_coordinate(dset, np.array([[20, 20, 20]])), list) + + mask_data = np.zeros(dset.masker.mask_img.shape, int) + mask_data[40, 40, 40] = 1 + mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine) + assert isinstance(searcher.get_studies_by_mask(dset, mask=mask_img), list) + + # If label is not available, raise ValueError + with pytest.raises(ValueError): + searcher.get_studies_by_label(dset, "dog") + + def test_dataset_smoke(): """Smoke test for nimare.dataset.Dataset initialization and get methods.""" db_file = op.join(get_test_data_path(), "neurosynth_dset.json") @@ -19,26 +47,6 @@ def test_dataset_smoke(): # Test that Dataset.masker is portable assert not nib.is_proxy(dset.masker.mask_img_.dataobj) - methods = [dset.get_images, dset.get_labels, dset.get_metadata, dset.get_texts] - for method in methods: - assert isinstance(method(), list) - assert isinstance(method(ids=dset.ids[:5]), list) - assert isinstance(method(ids=dset.ids[0]), list) - - assert isinstance(dset.get_images(imtype="beta"), list) - assert isinstance(dset.get_metadata(field="sample_sizes"), list) - assert isinstance(dset.get_studies_by_label("cogat_cognitive_control"), list) - assert isinstance(dset.get_studies_by_coordinate(np.array([[20, 20, 20]])), list) - - # If label is not available, raise ValueError - with pytest.raises(ValueError): - dset.get_studies_by_label("dog") - - mask_data = np.zeros(dset.masker.mask_img.shape, int) - mask_data[40, 40, 40] = 1 - mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine) - assert isinstance(dset.get_studies_by_mask(mask_img), list) - dset1 = dset.slice(dset.ids[:5]) dset2 = dset.slice(dset.ids[5:]) assert isinstance(dset1, dataset.Dataset) diff --git a/nimare/tests/test_decode_continuous.py b/nimare/tests/test_decode_continuous.py index c4a31738d..ee70239aa 100644 --- a/nimare/tests/test_decode_continuous.py +++ b/nimare/tests/test_decode_continuous.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from nimare.dataset import DatasetSearcher from nimare.decode import continuous from nimare.meta import kernel, mkda @@ -12,7 +13,8 @@ def test_CorrelationDecoder_smoke(testdata_laird): """Smoke test for continuous.CorrelationDecoder.""" testdata_laird = testdata_laird.copy() - features = testdata_laird.get_labels(ids=testdata_laird.ids[0])[:5] + searcher = DatasetSearcher() + features = searcher.get_labels(testdata_laird, ids=testdata_laird.ids[0])[:5] decoder = continuous.CorrelationDecoder(features=features) decoder.fit(testdata_laird) @@ -29,7 +31,8 @@ def 
@@ -29,7 +31,8 @@ def test_CorrelationDistributionDecoder_smoke(testdata_laird, tmp_path_factory):
     tmpdir = tmp_path_factory.mktemp("test_CorrelationDistributionDecoder")
     testdata_laird = testdata_laird.copy()
-    features = testdata_laird.get_labels(ids=testdata_laird.ids[0])[:5]
+    searcher = DatasetSearcher()
+    features = searcher.get_labels(testdata_laird, ids=testdata_laird.ids[0])[:5]
 
     decoder = continuous.CorrelationDistributionDecoder(features=features)
diff --git a/nimare/tests/test_decode_discrete.py b/nimare/tests/test_decode_discrete.py
index 17b65c771..d607dbddc 100644
--- a/nimare/tests/test_decode_discrete.py
+++ b/nimare/tests/test_decode_discrete.py
@@ -5,6 +5,7 @@ import pandas as pd
 import pytest
 
+from nimare.dataset import DatasetSearcher
 from nimare.decode import discrete
 
 
@@ -38,8 +39,9 @@ def test_brainmap_decode(testdata_laird):
 
 def test_NeurosynthDecoder(testdata_laird):
     """Smoke test for discrete.NeurosynthDecoder."""
+    searcher = DatasetSearcher()
     ids = testdata_laird.ids[:5]
-    labels = testdata_laird.get_labels(ids=testdata_laird.ids)
+    labels = searcher.get_labels(testdata_laird, ids=testdata_laird.ids)
     decoder = discrete.NeurosynthDecoder(features=labels)
     decoder.fit(testdata_laird)
     decoded_df = decoder.transform(ids=ids)
@@ -65,8 +67,9 @@ def test_NeurosynthDecoder_featuregroup_failure(testdata_laird):
 
 def test_BrainMapDecoder(testdata_laird):
     """Smoke test for discrete.BrainMapDecoder."""
+    searcher = DatasetSearcher()
     ids = testdata_laird.ids[:5]
-    labels = testdata_laird.get_labels(ids=testdata_laird.ids)
+    labels = searcher.get_labels(testdata_laird, ids=testdata_laird.ids)
     decoder = discrete.BrainMapDecoder(features=labels)
     decoder.fit(testdata_laird)
     decoded_df = decoder.transform(ids=ids)
@@ -83,7 +86,8 @@ def test_BrainMapDecoder_failure(testdata_laird):
 
 def test_ROIAssociationDecoder(testdata_laird, roi_img):
     """Smoke test for discrete.ROIAssociationDecoder."""
-    labels = testdata_laird.get_labels(ids=testdata_laird.ids)
+    searcher = DatasetSearcher()
+    labels = searcher.get_labels(testdata_laird, ids=testdata_laird.ids)
     decoder = discrete.ROIAssociationDecoder(masker=roi_img, features=labels)
     decoder.fit(testdata_laird)
     decoded_df = decoder.transform()
diff --git a/nimare/tests/test_workflows.py b/nimare/tests/test_workflows.py
index 526ec90ac..0fffb4c52 100644
--- a/nimare/tests/test_workflows.py
+++ b/nimare/tests/test_workflows.py
@@ -2,6 +2,7 @@ import os.path as op
 
 from nimare import cli, workflows
+from nimare.dataset import DatasetSearcher
 from nimare.tests.utils import get_test_data_path
 
 
@@ -127,7 +128,8 @@ def test_conperm_workflow_function_smoke(testdata_ibma, tmp_path_factory):
     """Run smoke test of the contrast permutation workflow as a function."""
     tmpdir = tmp_path_factory.mktemp("test_conperm_workflow_function_smoke")
     dset = testdata_ibma
-    files = dset.get_images(imtype="beta")
+    searcher = DatasetSearcher()
+    files = searcher.get_images(dset, imtype="beta")
     mask_image = op.join(get_test_data_path(), "test_pain_dataset", "mask.nii.gz")
     prefix = "test"
 
@@ -142,7 +144,8 @@ def test_conperm_workflow_cli_smoke(testdata_ibma, tmp_path_factory):
     """Run smoke test of the contrast permutation workflow as a CLI."""
     tmpdir = tmp_path_factory.mktemp("test_conperm_workflow_cli_smoke")
     dset = testdata_ibma
-    files = dset.get_images(imtype="beta")
+    searcher = DatasetSearcher()
+    files = searcher.get_images(dset, imtype="beta")
     mask_image = op.join(get_test_data_path(), "test_pain_dataset", "mask.nii.gz")
     prefix = "test"
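As the decoder tests above illustrate, feature labels now come from the searcher before being handed to a decoder. A sketch under the same assumptions (a hypothetical path to a coordinate-based Dataset with Neurosynth-style labels):

    from nimare.dataset import Dataset, DatasetSearcher
    from nimare.decode import discrete

    dset = Dataset("neurosynth_dset.json")  # hypothetical path
    searcher = DatasetSearcher()
    labels = searcher.get_labels(dset, ids=dset.ids)

    # Fit on the full Dataset, then decode a subset of studies.
    decoder = discrete.NeurosynthDecoder(features=labels)
    decoder.fit(dset)
    decoded_df = decoder.transform(ids=dset.ids[:5])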
diff --git a/nimare/utils.py b/nimare/utils.py
index e49d73204..1d337682d 100755
--- a/nimare/utils.py
+++ b/nimare/utils.py
@@ -643,33 +643,6 @@ def _find_stem(arr):
     return res
 
 
-def _uk_to_us(text):
-    """Convert UK spellings to US based on a converter.
-
-    .. versionadded:: 0.0.2
-
-    Parameters
-    ----------
-    text : :obj:`str`
-
-    Returns
-    -------
-    text : :obj:`str`
-
-    Notes
-    -----
-    The english_spellings.csv file is from http://www.tysto.com/uk-us-spelling-list.html.
-    """
-    SPELL_DF = pd.read_csv(op.join(get_resource_path(), "english_spellings.csv"), index_col="UK")
-    SPELL_DICT = SPELL_DF["US"].to_dict()
-
-    if isinstance(text, str):
-        # Convert British to American English
-        pattern = re.compile(r"\b(" + "|".join(SPELL_DICT.keys()) + r")\b")
-        text = pattern.sub(lambda x: SPELL_DICT[x.group()], text)
-    return text
-
-
 def use_memmap(logger, n_files=1):
     """Memory-map array to a file, and perform cleanup after.
 
@@ -824,74 +797,6 @@ def _safe_transform(imgs, masker, memory_limit="1gb", dtype="auto", memfile=None):
     return masked_data
 
 
-def _add_metadata_to_dataframe(
-    dataset,
-    dataframe,
-    metadata_field,
-    target_column,
-    filter_func=np.mean,
-):
-    """Add metadata from a Dataset to a DataFrame.
-
-    .. versionadded:: 0.0.8
-
-    This is particularly useful for kernel transformers or estimators where a given metadata field
-    is necessary (e.g., ALEKernel with "sample_size"), but we want to just use the coordinates
-    DataFrame instead of passing the full Dataset.
-
-    Parameters
-    ----------
-    dataset : :obj:`~nimare.dataset.Dataset`
-        Dataset containing study IDs and metadata to feed into dataframe.
-    dataframe : :obj:`pandas.DataFrame`
-        DataFrame containing study IDs, into which Dataset metadata will be merged.
-    metadata_field : :obj:`str`
-        Metadata field in ``dataset``.
-    target_column : :obj:`str`
-        Name of the column that will be added to ``dataframe``, containing information from the
-        Dataset.
-    filter_func : :obj:`function`, optional
-        Function to apply to the metadata so that it fits as a column in a DataFrame.
-        Default is ``numpy.mean``.
-
-    Returns
-    -------
-    dataframe : :obj:`pandas.DataFrame`
-        Updated DataFrame with ``target_column`` added.
-    """
-    dataframe = dataframe.copy()
-
-    if metadata_field in dataset.get_metadata():
-        # Collect metadata from Dataset
-        metadata = dataset.get_metadata(field=metadata_field, ids=dataset.ids)
-        metadata = [[m] for m in metadata]
-        # Create a DataFrame with the metadata
-        metadata = pd.DataFrame(
-            index=dataset.ids,
-            data=metadata,
-            columns=[metadata_field],
-        )
-        # Reduce the metadata (if in list/array format) to single values
-        metadata[target_column] = metadata[metadata_field].apply(filter_func)
-        # Merge metadata df into coordinates df
-        dataframe = dataframe.merge(
-            right=metadata,
-            left_on="id",
-            right_index=True,
-            sort=False,
-            validate="many_to_one",
-            suffixes=(False, False),
-            how="left",
-        )
-    else:
-        LGR.warning(
-            f"Metadata field '{metadata_field}' not found. "
-            "Set a constant value for this field as an argument, if possible."
-        )
-
-    return dataframe
-
-
 def _check_type(obj, clss, **kwargs):
     """Check variable type and initialize if necessary.
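For reference, the behavior of the removed ``_uk_to_us`` helper, which is relocated rather than deleted in this PR. A sketch; the import path reflects the new module location elsewhere in this diff, and the example assumes "analyse" and "colour" are rows in the bundled english_spellings.csv:

    from nimare.extract.utils import _uk_to_us  # new location per this PR

    # Whole-word UK spellings are swapped for their US equivalents.
    text = _uk_to_us("We analyse the colour maps.")
    print(text)  # expected: "We analyze the color maps."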
diff --git a/nimare/workflows/ale.py b/nimare/workflows/ale.py
index acc572d0a..54d56d7fe 100644
--- a/nimare/workflows/ale.py
+++ b/nimare/workflows/ale.py
@@ -7,6 +7,7 @@ import numpy as np
 
 from nimare.correct import FWECorrector
+from nimare.dataset import DatasetSearcher
 from nimare.diagnostics import FocusCounter
 from nimare.io import convert_sleuth_to_dataset
 from nimare.meta import ALE, ALESubtraction
@@ -26,6 +27,7 @@ def ale_sleuth_workflow(
 ):
     """Perform ALE meta-analysis from Sleuth text file."""
     LGR.info("Loading coordinates...")
+    searcher = DatasetSearcher()
 
     if fwhm:
         fwhm_str = f"of {fwhm} mm"
@@ -34,7 +36,7 @@
 
     if not sleuth_file2:
         dset = convert_sleuth_to_dataset(sleuth_file, target="ale_2mm")
-        n_subs = dset.get_metadata(field="sample_sizes")
+        n_subs = searcher.get_metadata(dset, field="sample_sizes")
         n_subs = np.sum(n_subs)
 
         boilerplate = """
@@ -115,9 +117,9 @@
     else:
         dset1 = convert_sleuth_to_dataset(sleuth_file, target="ale_2mm")
         dset2 = convert_sleuth_to_dataset(sleuth_file2, target="ale_2mm")
-        n_subs1 = dset1.get_metadata(field="sample_sizes")
+        n_subs1 = searcher.get_metadata(dset1, field="sample_sizes")
         n_subs1 = np.sum(n_subs1)
-        n_subs2 = dset2.get_metadata(field="sample_sizes")
+        n_subs2 = searcher.get_metadata(dset2, field="sample_sizes")
         n_subs2 = np.sum(n_subs2)
 
         boilerplate = """
diff --git a/nimare/workflows/macm.py b/nimare/workflows/macm.py
index 9b523db20..e1d5f439c 100644
--- a/nimare/workflows/macm.py
+++ b/nimare/workflows/macm.py
@@ -5,7 +5,7 @@ from shutil import copyfile
 
 from nimare.correct import FWECorrector
-from nimare.dataset import Dataset
+from nimare.dataset import Dataset, DatasetSearcher
 from nimare.meta import ALE
 
 LGR = logging.getLogger(__name__)
@@ -17,7 +17,8 @@ def macm_workflow(
     """Perform MACM with ALE algorithm."""
     LGR.info("Loading coordinates...")
     dset = Dataset(dataset_file)
-    sel_ids = dset.get_studies_by_mask(mask_file)
+    searcher = DatasetSearcher()
+    sel_ids = searcher.get_studies_by_mask(dset, mask_file)
     sel_dset = dset.slice(sel_ids)
 
     # override sample size
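Condensed, the study-selection step of the MACM workflow now reads as follows. A sketch with hypothetical file paths:

    from nimare.dataset import Dataset, DatasetSearcher

    dset = Dataset("dataset.json")  # hypothetical path
    searcher = DatasetSearcher()
    # Keep only studies reporting at least one coordinate inside the ROI mask.
    sel_ids = searcher.get_studies_by_mask(dset, "roi_mask.nii.gz")  # hypothetical mask
    sel_dset = dset.slice(sel_ids)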