From 679098b53c0c071dcd14cd422cdb85986ff0cb9a Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Wed, 13 Sep 2023 18:28:57 +0100 Subject: [PATCH] Add load_data. --- spikewrap/data_classes/base.py | 124 ++++++++++++++++++++++++ spikewrap/data_classes/preprocessing.py | 58 +++++++++++ spikewrap/examples/load_data.py | 23 +++++ tests/__init__.py | 0 tests/test_integration/__init__.py | 0 tests/test_unit/__init__.py | 0 6 files changed, 205 insertions(+) create mode 100644 spikewrap/data_classes/base.py create mode 100644 spikewrap/data_classes/preprocessing.py create mode 100644 spikewrap/examples/load_data.py delete mode 100644 tests/__init__.py delete mode 100644 tests/test_integration/__init__.py delete mode 100644 tests/test_unit/__init__.py diff --git a/spikewrap/data_classes/base.py b/spikewrap/data_classes/base.py new file mode 100644 index 0000000..f8f57ab --- /dev/null +++ b/spikewrap/data_classes/base.py @@ -0,0 +1,124 @@ +import fnmatch +from collections import UserDict +from collections.abc import ItemsView, KeysView, ValuesView +from dataclasses import dataclass +from itertools import chain +from pathlib import Path +from typing import Callable, Dict, List, Literal + + +@dataclass +class BaseUserDict(UserDict): + """ + Base class for `PreprocessingData` and `SortingData` + used for checking and formatting `base_path`, `sub_name` + and `run_names`. The layout of the `rawdata` and + `derivatives` folder is identical up to the run + folder, allowing use of this class for + preprocessing and sorting. 
+ + Base UserDict that implements the + keys(), values() and items() convenience functions.""" + + base_path: Path + sub_name: str + sessions_and_runs: Dict + + def __post_init__(self) -> None: + self.data: Dict = {} + self.base_path = Path(self.base_path) + self.check_run_names_are_formatted_as_list() + + def check_run_names_are_formatted_as_list(self) -> None: + """Wrap any session's run name given as a bare string in a list.""" + for key, value in self.sessions_and_runs.items(): + if not isinstance(value, List): + assert isinstance( + value, str + ), "Run names must be string or list of strings" + self.sessions_and_runs[key] = [value] + + def preprocessing_sessions_and_runs(self): # TODO: type hint + """Return an ordered list of (session name, run name) tuples.""" + ordered_ses_names = list( + chain(*[[ses] * len(runs) for ses, runs in self.sessions_and_runs.items()]) + ) + ordered_run_names = list( + chain(*[runs for runs in self.sessions_and_runs.values()]) + ) + + return list(zip(ordered_ses_names, ordered_run_names)) + + def _validate_inputs( + self, + top_level_folder: Literal["rawdata", "derivatives"], + get_top_level_folder: Callable, + get_sub_level_folder: Callable, + get_sub_path: Callable, + get_run_path: Callable, + ) -> None: + """ + Check the rawdata / derivatives folder, the subject folder and all + session and run folders exist, and check the gate in each run name. + + Parameters + ---------- + top_level_folder : Literal["rawdata", "derivatives"] + Folder name, used in error messages. The `get_*` callables return + the top-level, subject, session and run folder paths to check. + + Raises + ------ + AssertionError + If a folder is missing, or a run name's gate index is absent, repeated or non-zero. + """ + assert get_top_level_folder().is_dir(), ( + f"Ensure there is a folder in base path called '" + f"{top_level_folder}'.\n" + f"No {top_level_folder} directory found at " + f"{get_top_level_folder()}\n" + f"where subject-level folders must be placed." + ) + + assert get_sub_level_folder().is_dir(), ( + f"Subject directory not found. 
{self.sub_name} " + f"is not a folder in {get_top_level_folder()}" + ) + + for ses_name in self.sessions_and_runs.keys(): + assert ( + ses_path := get_sub_path(ses_name) + ).is_dir(), f"{ses_name} was not found at folder path {ses_path}" + + for run_name in self.sessions_and_runs[ses_name]: + assert (run_path := get_run_path(ses_name, run_name)).is_dir(), ( + f"The run folder {run_path.stem} cannot be found at " + f"file path {run_path.parent}." + ) + + gate_str = fnmatch.filter(run_name.split("_"), "g?") + + assert len(gate_str) > 0, ( + f"The SpikeGLX gate index should be in the run name. " + f"It was not found in the name {run_name}." + f"\nEnsure the gate number is in the SpikeGLX-output filename." + ) + + assert len(gate_str) == 1, ( + f"The SpikeGLX gate appears in the name " + f"{run_name} more than once" + ) + + assert int(gate_str[0][1:]) == 0, ( + f"Gate with index larger than 0 is not supported. This is found " + f"in run name {run_name}. " + ) + + def keys(self) -> KeysView: + return self.data.keys() + + def items(self) -> ItemsView: + return self.data.items() + + def values(self) -> ValuesView: + return self.data.values() diff --git a/spikewrap/data_classes/preprocessing.py b/spikewrap/data_classes/preprocessing.py new file mode 100644 index 0000000..632ca83 --- /dev/null +++ b/spikewrap/data_classes/preprocessing.py @@ -0,0 +1,58 @@ +import shutil +from dataclasses import dataclass +from typing import Dict + +import spikeinterface + +from ..utils import utils +from .base import BaseUserDict + + +@dataclass +class PreprocessingData(BaseUserDict): + """ + Dictionary to store SpikeInterface preprocessing recordings. + + Details on the preprocessing steps are held in the dictionary keys, + e.g. 0-raw, 1-raw-bandpass_filter, 2-raw_bandpass_filter-common_average + and recording objects are held in the value. These are generated + by the `pipeline.preprocess.run_preprocessing()` function. 
+ + The class manages paths to raw data and preprocessing output, + and defines methods to dump key information and the SpikeInterface + binary to disk. Note that SI preprocessing is lazy and + preprocessing is only run when the recording.get_traces() + is called, or the data is saved to binary. + + Parameters + ---------- + base_path : Union[Path, str] + Path to the folder containing the rawdata folder, which holds the subjects. + + sub_name : str + 'subject' to preprocess. The subject top level dir should + reside in base_path/rawdata/. + + sessions_and_runs : Dict + Session names mapped to a run name (including the SpikeGLX + gate index, e.g. `_g0`) or a list of run names. + """ + + def __post_init__(self) -> None: + super().__post_init__() + self._validate_rawdata_inputs() + + self.sync: Dict = {} + + for ses_name, run_name in self.preprocessing_sessions_and_runs(): + utils.update(self.data, ses_name, run_name, {"0-raw": None}) + utils.update(self.sync, ses_name, run_name, None) + + def _validate_rawdata_inputs(self) -> None: + self._validate_inputs( + "rawdata", + self.get_rawdata_top_level_path, + self.get_rawdata_sub_path, + self.get_rawdata_ses_path, + self.get_rawdata_run_path, + ) diff --git a/spikewrap/examples/load_data.py b/spikewrap/examples/load_data.py new file mode 100644 index 0000000..0b04e2f --- /dev/null +++ b/spikewrap/examples/load_data.py @@ -0,0 +1,23 @@ +from pathlib import Path + +from spikewrap.pipeline.load_data import load_data + +base_path = Path( + r"/ceph/neuroinformatics/neuroinformatics/scratch/jziminski/ephys/test_data/steve_multi_run/1119617/time-short-multises" +) + +sub_name = "sub-1119617" +sessions_and_runs = { + "ses-001": [ + "run-001_1119617_LSE1_shank12_g0", + "run-002_made_up_g0", + ], + "ses-002": [ + "run-001_1119617_pretest1_shank12_g0", + ], + "ses-003": [ + "run-002_1119617_pretest1_shank12_g0", + ], +} + +loaded_data = load_data(base_path, sub_name, sessions_and_runs, data_format="spikeglx") \ No newline at end of file diff --git a/tests/__init__.py 
b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_integration/__init__.py b/tests/test_integration/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_unit/__init__.py b/tests/test_unit/__init__.py deleted file mode 100644 index e69de29..0000000