diff --git a/invokers/run_gridsearch.py b/invokers/run_gridsearch.py new file mode 100644 index 00000000..62727751 --- /dev/null +++ b/invokers/run_gridsearch.py @@ -0,0 +1,95 @@ +from pathlib import Path +import argparse +from kymata.gridsearch.gridsearch import do_gridsearch +from kymata.io.functions import load_function +from kymata.io.mne import load_emeg_pack + + +def main(): + + parser = argparse.ArgumentParser(description='Gridsearch Params') + parser.add_argument('--emeg_sample_rate', type=int, default=1000, + help='sampling rate of the emeg machine (not implemented yet)') + parser.add_argument('--snr', type=float, default=3, + help='inverse solution snr') + parser.add_argument('--downsample_rate', type=int, default=5, + help='downsample_rate') + parser.add_argument('--base_dir', type=str, default="/imaging/projects/cbu/kymata/data/dataset_4-english-narratives/", + help='base data directory') + parser.add_argument('--data_path', type=str, default="intrim_preprocessing_files/3_trialwise_sensorspace/evoked_data", + help='data path after base dir') + parser.add_argument('--function_path', type=str, default="predicted_function_contours/GMSloudness/stimulisig", + help='snr') + parser.add_argument('--function_name', type=str, default="d_IL2", + help='function name in stimulisig') + parser.add_argument('--emeg_file', type=str, default="participant_01-ave", + help='emeg_file_name') + parser.add_argument('--ave_mode', type=str, default="ave", + help='either ave or add, either average over the list of repetitions or treat them as extra data') + parser.add_argument('--inverse_operator', type=str, default="intrim_preprocessing_files/4_hexel_current_reconstruction/inverse-operators", + help='inverse solution path') + parser.add_argument('--seconds_per_split', type=float, default=0.5, + help='seconds in each split of the recording, also maximum range of latencies being checked') + parser.add_argument('--n_splits', type=int, default=800, + help='number of splits to split the recording into, (set to 400/seconds_per_split for full file)') + parser.add_argument('--n_derangements', type=int, default=1, + help='inverse solution snr') + parser.add_argument('--start_latency', type=float, default=-100, + help='earliest latency to check in cross correlation') + parser.add_argument('--emeg_t_start', type=float, default=-200, + help='start of the emeg evoked files relative to the start of the function') + parser.add_argument('--audio_shift_correction', type=float, default=0.000_537_5, + help='audio shift correction, for every second of function, add this number of seconds (to the start of the emeg split) per seconds of emeg seen') + args = parser.parse_args() + args.base_dir = Path(args.base_dir) + + + emeg_dir = Path(args.base_dir, args.data_path) + emeg_paths = [Path(emeg_dir, args.emeg_file)] + + participants = ['participant_01', + 'participant_01b', + 'participant_02', + 'participant_03', + 'participant_04', + 'participant_05', + 'pilot_01', + 'pilot_02'] + + reps = [f'_rep{i}' for i in range(8)] + ['-ave'] + + # emeg_paths = [Path(emeg_dir, p + r) for p in participants[:2] for r in reps[-1:]] + + inverse_operator = Path(args.base_dir, args.inverse_operator, f"{participants[0]}_ico5-3L-loose02-cps-nodepth.fif") + + # Load data + emeg, ch_names = load_emeg_pack(emeg_paths, + need_names=False, + ave_mode=args.ave_mode, + inverse_operator=None, #inverse_operator, # set to None/inverse_operator if you want to run on sensor space/source space + p_tshift=None, + snr=args.snr) + + func = load_function(Path(args.base_dir, args.function_path), + func_name=args.function_name, + bruce_neurons=(5, 10)) + func = func.downsampled(args.downsample_rate) + + es = do_gridsearch( + emeg_values=emeg, + sensor_names=ch_names, + function=func, + seconds_per_split=args.seconds_per_split, + n_derangements=args.n_derangements, + n_splits=args.n_splits, + start_latency=args.start_latency, + emeg_t_start=args.emeg_t_start, + emeg_sample_rate=args.emeg_sample_rate, + audio_shift_correction=args.audio_shift_correction, + ave_mode=args.ave_mode, + ) + + # expression_plot(es) + +if __name__ == '__main__': + main() diff --git a/invokers/run_preprocessing_dataset4.py b/invokers/run_preprocessing_dataset4.py new file mode 100644 index 00000000..bd342134 --- /dev/null +++ b/invokers/run_preprocessing_dataset4.py @@ -0,0 +1,33 @@ +from pathlib import Path + +import sys +sys.path.append('/imaging/projects/cbu/kymata/analyses/ollie/kymata-toolbox') + +from kymata.io.yaml import load_config +from kymata.preproc.data_cleansing import run_preprocessing, create_trials, create_trialwise_data + + +# noinspection DuplicatedCode +def main(): + config = load_config(str(Path(Path(__file__).parent.parent, "kymata", "config", "dataset4.yaml"))) + + create_trialwise_data( + dataset_directory_name=config['dataset_directory_name'], + list_of_participants=config['list_of_participants'], + repetitions_per_runs=config['repetitions_per_runs'], + number_of_runs=config['number_of_runs'], + number_of_trials=config['number_of_trials'], + input_streams=config['input_streams'], + eeg_thresh=float(config['eeg_thresh']), + grad_thresh=float(config['grad_thresh']), + mag_thresh=float(config['mag_thresh']), + visual_delivery_latency=config['visual_delivery_latency'], + audio_delivery_latency=config['audio_delivery_latency'], + audio_delivery_shift_correction=config['audio_delivery_shift_correction'], + tmin=config['tmin'], + tmax=config['tmax'], + ) + + +if __name__ == '__main__': + main() diff --git a/kymata/config/dataset3.yaml b/kymata/config/dataset3.yaml index 6baada0a..13486e19 100644 --- a/kymata/config/dataset3.yaml +++ b/kymata/config/dataset3.yaml @@ -45,9 +45,9 @@ supress_excessive_plots_and_prompts: True audio_delivery_latency: 26 # in milliseconds visual_delivery_latency: 17 # in milliseconds tactile_delivery_latency: 0 # in milliseconds -audio_delivery_shift_correction: 0 # in milliseconds per second -tmin: -0.2 -tmax: 1.8 +audio_delivery_shift_correction: 0 # in seconds per second +tmin: -0.2 # seconds +tmax: 1.8 # seconds eeg_thresh: 1 #200e-6 grad_thresh: 1 #4000e-13 diff --git a/kymata/config/dataset4.yaml b/kymata/config/dataset4.yaml old mode 100644 new mode 100755 index 26bf33ae..842f9f41 --- a/kymata/config/dataset4.yaml +++ b/kymata/config/dataset4.yaml @@ -36,9 +36,9 @@ meg: True audio_delivery_latency: 26 # in milliseconds visual_delivery_latency: 17 # in milliseconds tactile_delivery_latency: 0 # in milliseconds -audio_delivery_shift_correction: 0.5375 # in milliseconds per second -tmin: -0.2 -tmax: 1.8 +audio_delivery_shift_correction: 0.0005375 # in seconds per second +tmin: -0.2 # seconds +tmax: 1.8 # seconds eeg_thresh: 1 #200e-6 grad_thresh: 1 #4000e-13 diff --git a/kymata/entities/functions.py b/kymata/entities/functions.py new file mode 100644 index 00000000..cba27b7b --- /dev/null +++ b/kymata/entities/functions.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass + +from numpy.typing import NDArray + + +@dataclass +class Function: + name: str + values: NDArray + sample_rate: float # Hertz + + def downsampled(self, rate: int): + return Function( + name=self.name, + values=self.values[::rate], + sample_rate=self.sample_rate / rate, + ) + + @property + def time_step(self) -> float: + return 1 / self.sample_rate diff --git a/kymata/entities/iterables.py b/kymata/entities/iterables.py index 5faeb602..c2a1c910 100644 --- a/kymata/entities/iterables.py +++ b/kymata/entities/iterables.py @@ -6,6 +6,7 @@ from typing import Sequence from numpy import ndarray +from numpy.typing import NDArray def all_equal(sequence: Sequence) -> bool: @@ -20,10 +21,10 @@ def all_equal(sequence: Sequence) -> bool: # numpy arrays deal with equality weirdly # Check first two items are equal, and equal to the rest - first: ndarray = sequence[0] + first: NDArray = sequence[0] if not isinstance(sequence[1], ndarray): return False - second: ndarray = sequence[1] + second: NDArray = sequence[1] try: # noinspection PyUnresolvedReferences return (first == second).all() and all_equal(sequence[1:]) diff --git a/kymata/entities/sparse_data.py b/kymata/entities/sparse_data.py old mode 100644 new mode 100755 diff --git a/kymata/gridsearch/__init__.py b/kymata/gridsearch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/kymata/gridsearch/gridsearch.py b/kymata/gridsearch/gridsearch.py new file mode 100644 index 00000000..defb95da --- /dev/null +++ b/kymata/gridsearch/gridsearch.py @@ -0,0 +1,190 @@ +import numpy as np +from numpy.typing import NDArray +from scipy import stats + +from kymata.entities.functions import Function +from kymata.math.combinatorics import generate_derangement +from kymata.math.vector import normalize, get_stds +#from kymata.entities.expression import SensorExpressionSet, p_to_logp +import matplotlib.pyplot as plt + +def do_gridsearch( + emeg_values: NDArray, # chan x time + function: Function, + sensor_names: list[str], + start_latency: float, # ms + emeg_t_start: float, # ms + emeg_sample_rate: int = 1000, # Hertz + audio_shift_correction: float = 0.000_537_5, # seconds/second # TODO: describe in which direction? + n_derangements: int = 1, + seconds_per_split: float = 0.5, + n_splits: int = 800, + ave_mode: str = 'ave', # either ave or add, for averaging over input files or adding in as extra evidence + add_autocorr: bool = True, + plot_name: str = 'example' + ): + """ + Do the Kymata gridsearch over all hexels for all latencies. + """ + + # We'll need to downsample the EMEG to match the function's sample rate + downsample_rate: int = int(emeg_sample_rate / function.sample_rate) + + n_samples_per_split = int(seconds_per_split * emeg_sample_rate * 2 // downsample_rate) + + if ave_mode == 'add': + n_reps = len(EMEG_paths) + else: + n_reps = 1 + + func_length = n_splits * n_samples_per_split // 2 + if func_length < function.values.shape[0]: + func = function.values[:func_length].reshape(n_splits, n_samples_per_split // 2) + print(f'WARNING: not using full 400s of the file (only using {round(n_splits * seconds_per_split, 2)}s)') + else: + func = function.values.reshape(n_splits, n_samples_per_split // 2) + n_channels = emeg_values.shape[0] + + # Reshape EMEG into splits of `seconds_per_split` s + split_initial_timesteps = [int(start_latency + round(i * 1000 * seconds_per_split * (1 + audio_shift_correction)) - emeg_t_start) + for i in range(n_splits)] + + emeg_reshaped = np.zeros((n_channels, n_splits * n_reps, n_samples_per_split)) + for j in range(n_reps): + for split_i in range(n_splits): + split_start = split_initial_timesteps[split_i] + split_stop = split_start + int(2 * emeg_sample_rate * seconds_per_split) + emeg_reshaped[:, split_i + (j * n_splits), :] = emeg_values[:, j, split_start:split_stop:downsample_rate] + + del emeg_values + + # Derangements for null distribution + derangements = np.zeros((n_derangements, n_splits * n_reps), dtype=int) + for der_i in range(n_derangements): + derangements[der_i, :] = generate_derangement(n_splits * n_reps, n_splits) + derangements = np.vstack((np.arange(n_splits * n_reps), derangements)) # Include the identity on top + + # Fast cross-correlation using FFT + emeg_reshaped = normalize(emeg_reshaped) + emeg_stds = get_stds(emeg_reshaped, n_samples_per_split // 2) + emeg_reshaped = np.fft.rfft(emeg_reshaped, n=n_samples_per_split, axis=-1) + F_func = np.conj(np.fft.rfft(normalize(func), n=n_samples_per_split, axis=-1)) + corrs = np.zeros((n_channels, n_derangements + 1, n_splits * n_reps, n_samples_per_split // 2)) + for der_i, derangement in enumerate(derangements): + deranged_emeg = emeg_reshaped[:, derangement, :] + corrs[:, der_i] = np.fft.irfft(deranged_emeg * F_func)[:, :, :n_samples_per_split//2] / emeg_stds[:, derangement] + + if add_autocorr: + auto_corrs = np.zeros((n_splits, n_samples_per_split//2)) + noise = normalize(np.random.randn(func.shape[0], func.shape[1])) * 0 + noisy_func = normalize(np.copy(func)) + noise + nn = n_samples_per_split // 2 + + F_noisy_func = np.fft.rfft(normalize(noisy_func), n=nn, axis=-1) + F_func = np.conj(np.fft.rfft(normalize(func), n=nn, axis=-1)) + + auto_corrs = np.fft.irfft(F_noisy_func * F_func) + + del F_func, deranged_emeg, emeg_reshaped + + log_pvalues = _ttest(corrs) + + latencies = np.linspace(start_latency, start_latency + (seconds_per_split * 1000), n_samples_per_split // 2 + 1)[:-1] + + if plot_name: + plt.figure(1) + corr_avrs = np.mean(corrs[:, 0]**2, axis=-2) + maxs = np.max(corr_avrs, axis=1) + n_amaxs = 5 + amaxs = np.argpartition(maxs, -n_amaxs)[-n_amaxs:] + amax = np.argmax(corr_avrs) // (n_samples_per_split // 2) + amaxs = [i for i in amaxs if i != amax] # + [209] + + plt.plot(latencies, np.mean(corrs[amax, 0], axis=-2).T, 'r-', label=amax) + plt.plot(latencies, np.mean(corrs[amaxs, 0], axis=-2).T, label=amaxs) + std_null = np.mean(np.std(corrs[:, 1], axis=-2), axis=0).T * 3 / np.sqrt(n_reps * n_splits) # 3 pop std.s + std_real = np.std(corrs[amax, 0], axis=-2).T * 3 / np.sqrt(n_reps * n_splits) + av_real = np.mean(corrs[amax, 0], axis=-2).T + #print(std_null) + plt.fill_between(latencies, -std_null, std_null, alpha=0.5, color='grey') + plt.fill_between(latencies, av_real - std_real, av_real + std_real, alpha=0.25, color='red') + + if add_autocorr: + peak_lat_ind = np.argmax(corr_avrs) % (n_samples_per_split // 2) + peak_lat = latencies[peak_lat_ind] + peak_corr = np.mean(corrs[amax, 0], axis=-2)[peak_lat_ind] + print(f'{function.name}: peak lat, peak corr, ind:', peak_lat, peak_corr, amax) + + auto_corrs = np.mean(auto_corrs, axis=0) + plt.plot(latencies, np.roll(auto_corrs, peak_lat_ind) * peak_corr / np.max(auto_corrs), 'k--', label='func auto-corr') + + plt.axvline(0, color='k') + plt.legend() + plt.xlabel('latencies (ms)') + plt.ylabel('Corr coef.') + plt.savefig(f'{plot_name}_1.png') + plt.clf() + + plt.figure(2) + plt.plot(latencies, -log_pvalues[amax].T, 'r-', label=amax) + plt.plot(latencies, -log_pvalues[amaxs].T, label=amaxs) + plt.axvline(0, color='k') + plt.legend() + plt.xlabel('latencies (ms)') + plt.ylabel('p-values') + plt.savefig(f'{plot_name}_2.png') + plt.clf() + + return + + """es = SensorExpressionSet( + functions=function.name, + latencies=latencies / 1000, + sensors=sensor_names, + data=log_pvalues, + )""" + + return es + + +def _ttest(corrs: NDArray, use_all_lats: bool = True): + """ + Vectorised Welch's t-test. + """ + n_channels, n_derangements, n_splits, t_steps = corrs.shape + + # Fisher Z-Transformation + corrs_z = 0.5 * np.log((1 + corrs) / (1 - corrs)) + + # Non-deranged values are on top + true_mean = np.mean(corrs_z[:, 0, :, :], axis=1) + true_var = np.var(corrs_z[:, 0, :, :], axis=1, ddof=1) + true_n = n_splits + + # Recompute mean and var for null correlations + # TODO: why looking at only 1 in the n_derangements dimension? + if use_all_lats: + rand_mean = np.mean(corrs_z[:, 1:, :, :].reshape(n_channels, -1), axis=1).reshape(n_channels, 1) + rand_var = np.var(corrs_z[:, 1:, :, :].reshape(n_channels, -1), axis=1, ddof=1).reshape(n_channels, 1) + rand_n = n_splits * n_derangements * t_steps + else: + rand_mean = np.mean(corrs_z[:, 1:, :, :].reshape(n_channels, -1, t_steps), axis=1) + rand_var = np.var(corrs_z[:, 1:, :, :].reshape(n_channels, -1, t_steps), axis=1, ddof=1) + rand_n = n_splits * n_derangements + + # Vectorized two-sample t-tests for all channels and time steps + numerator = true_mean - rand_mean + denominator = np.sqrt(true_var / true_n + rand_var / rand_n) + df = ((true_var / true_n + rand_var / rand_n) ** 2 / + ((true_var / true_n) ** 2 / (true_n - 1) + + (rand_var / rand_n) ** 2 / (rand_n - 1))) + + t_stat = numerator / denominator + + if np.min(df) <= 300: + log_p = np.log(stats.t.sf(np.abs(t_stat), df) * 2) # two-tailed p-value + else: + # norm v good approx for this, (logsf for t not implemented in logspace) + log_p = stats.norm.logsf(np.abs(t_stat)) + np.log(2) + + return log_p / np.log(10) # log base correction diff --git a/kymata/io/functions.py b/kymata/io/functions.py new file mode 100644 index 00000000..92f99e1c --- /dev/null +++ b/kymata/io/functions.py @@ -0,0 +1,70 @@ +from pathlib import Path + +from numpy import array, float16, convolve, mean +import numpy as np +from numpy.typing import NDArray +from scipy.io import loadmat + +from kymata.entities.functions import Function +from kymata.io.file import path_type + + +def load_function(function_path_without_suffix: path_type, func_name: str, n_derivatives: int = 0, bruce_neurons: tuple = (0, 10)) -> Function: + function_path_without_suffix = Path(function_path_without_suffix) + func: NDArray + if 'neurogram' in func_name: + print('USING BRUCE MODEL') + if function_path_without_suffix.with_suffix(".npz").exists(): + func_dict = np.load(function_path_without_suffix.with_suffix(".npz")) + func = func_dict[func_name] + func = np.mean(func[bruce_neurons[0]:bruce_neurons[1]], axis=0) + else: + mat = loadmat(function_path_without_suffix.with_suffix('.mat'))['neurogramResults'] + func = np.array(mat[func_name][0][0]) + # func = np.mean(func[bruce_neurons[0]:bruce_neurons[1]], axis=0) + tt = np.array(mat['t_'+func_name[-2:]][0][0]) + n_chans = func.shape[0] + new_mr_arr = np.zeros((n_chans, 400_000)) + new_ft_arr = np.zeros((n_chans, 400_000)) + T_end = tt[0, -1] + if func_name == 'neurogram_mr': + for i in range(n_chans): + new_mr_arr[i] = np.interp(np.linspace(0, 400, 400_000 + 1)[:-1], tt[0], func[i]) + elif func_name == 'neurogram_ft': + func = np.cumsum(func * (tt[0, 1] - tt[0, 0]), axis=-1) + for i in range(n_chans): + func_interp = np.interp(np.linspace(0, 400, 400_000 + 1), tt[0], func[i]) + new_ft_arr[i] = np.diff(func_interp, axis=-1) + + func_dict = {'neurogram_mr': new_mr_arr, + 'neurogram_ft': new_ft_arr} + np.savez(str(function_path_without_suffix.with_suffix(".npz")), **func_dict) + + func = func_dict[func_name] + func = np.mean(func[bruce_neurons[0]:bruce_neurons[1]], axis=0) + + else: + if function_path_without_suffix.with_suffix(".npz").exists(): + func_dict = np.load(str(function_path_without_suffix.with_suffix(".npz"))) + func = np.array(func_dict[func_name]) + else: + mat = loadmat(str(function_path_without_suffix.with_suffix(".mat")))['stimulisig'] + func_dict = {} + for key in mat.dtype.names: + if key == 'name': + continue + func_dict[key] = np.array(mat[key][0, 0], dtype=np.float16) + np.savez(str(function_path_without_suffix.with_suffix(".npz")), **func_dict) + func = np.array(mat[func_name][0][0]) + + if func_name in ('STL', 'IL', 'LTL'): + func = func.T + + for _ in range(n_derivatives): + func = np.convolve(func, [-1, 1], 'same') # derivative + + return Function( + name=func_name, + values=func.flatten().squeeze(), + sample_rate=1000, + ) diff --git a/kymata/io/mne.py b/kymata/io/mne.py new file mode 100644 index 00000000..bd4e1525 --- /dev/null +++ b/kymata/io/mne.py @@ -0,0 +1,67 @@ +from pathlib import Path + +from mne import read_evokeds, minimum_norm, set_eeg_reference +import numpy as np +from numpy.typing import NDArray +from os.path import isfile + +from kymata.io.file import path_type + + + +def load_single_emeg(emeg_path, need_names=False, inverse_operator=None, snr=4): + emeg_path_npy = f"{emeg_path}.npy" + emeg_path_fif = f"{emeg_path}.fif" + if isfile(emeg_path_npy) and (not need_names) and (inverse_operator is None): + ch_names: list[str] = [] # TODO: we'll need these + emeg: NDArray = np.load(emeg_path_npy) + else: + evoked = read_evokeds(emeg_path_fif, verbose=False) # should be len 1 list + if inverse_operator is not None: + lh_emeg, rh_emeg, ch_names = inverse_operate(evoked[0], inverse_operator, snr) + # TODO: I think ch_names here is the wrong thing + + emeg = None #np.concatenate((lh_emeg, rh_emeg), axis=0) + + # TODO: currently this goes OOM (node-h04 atleast): + # looks like this will be faster when split up anyway + # note, don't run the inv_op twice for rh and lh! + # TODO: move inverse operator to run after EMEG channel combination + + emeg = lh_emeg + del lh_emeg, rh_emeg + else: + emeg = evoked[0].get_data() # numpy array shape (sensor_num, N) = (370, 403_001) + ch_names = evoked[0].ch_names + emeg /= np.max(emeg, axis=1, keepdims=True) + if not isfile(emeg_path_npy): + np.save(emeg_path_npy, np.array(emeg, dtype=np.float16)) + del evoked + return emeg, ch_names + +def inverse_operate(evoked, inverse_operator, snr=4): + lambda2 = 1.0 / snr ** 2 + inverse_operator = minimum_norm.read_inverse_operator(inverse_operator, verbose=False) + set_eeg_reference(evoked, projection=True, verbose=False) + stc = minimum_norm.apply_inverse(evoked, inverse_operator, lambda2, 'MNE', pick_ori='normal', verbose=False) + return stc.lh_data, stc.rh_data, evoked.ch_names + +def load_emeg_pack(emeg_paths, need_names=False, ave_mode=None, inverse_operator=None, p_tshift=None, snr=4): # TODO: FIX PRE-AVE-NORMALISATION + if p_tshift is None: + p_tshift = [0]*len(emeg_paths) + emeg, emeg_names = load_single_emeg(emeg_paths[0], need_names, inverse_operator, snr) + emeg = emeg[:,p_tshift[0]:402001 + p_tshift[0]] + emeg = np.expand_dims(emeg, 1) + if ave_mode == 'add': + for i in range(1, len(emeg_paths)): + t_shift = p_tshift[i] + new_emeg = load_single_emeg(emeg_paths[i], need_names, inverse_operator, snr)[0][:,t_shift:402001 + t_shift] + emeg = np.concatenate((emeg, np.expand_dims(new_emeg, 1)), axis=1) + elif ave_mode == 'ave': + for i in range(1, len(emeg_paths)): + t_shift = p_tshift[i] + emeg += np.expand_dims(load_single_emeg(emeg_paths[i], need_names, inverse_operator, snr)[0][:,t_shift:402001 + t_shift], 1) + elif len(emeg_paths) > 1: + raise NotImplementedError(f'ave_mode "{ave_mode}" not known') + return emeg, emeg_names + diff --git a/kymata/math/__init__.py b/kymata/math/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/kymata/math/combinatorics.py b/kymata/math/combinatorics.py new file mode 100644 index 00000000..22343425 --- /dev/null +++ b/kymata/math/combinatorics.py @@ -0,0 +1,15 @@ +from numpy import arange +from numpy.random import randint + + +def generate_derangement(n, mod=int(1e9)): # approx 3ms runtime for n=400 + while True: + v = arange(n) + for j in range(n - 1, -1, -1): + p = randint(0, j + 1) + if v[p] % mod == j % mod: + break + else: + v[j], v[p] = v[p], v[j] + else: + return v diff --git a/kymata/math/vector.py b/kymata/math/vector.py new file mode 100644 index 00000000..88b18faf --- /dev/null +++ b/kymata/math/vector.py @@ -0,0 +1,23 @@ +import numpy as np +from numpy.typing import NDArray + + +def normalize(x: NDArray) -> NDArray: + """ + Remove the mean and divide by the Euclidean magnitude. + """ + x -= np.mean(x, axis=-1, keepdims=True) + x /= np.sqrt(np.sum(x**2, axis=-1, keepdims=True)) + return x + + +def get_stds(x, n): + """ + Get the stds (times sqrt(n)) to correct for normalisation difference between each set of n_samples + """ + d0, d1, d2 = x.shape + y = np.concatenate((np.zeros((d0, d1, 1)), np.cumsum(x**2, axis=-1)), axis=-1) + z = np.concatenate((np.zeros((d0, d1, 1)), np.cumsum(x, axis=-1)), axis=-1) + y = y[:,:,-n-1:-1] - y[:,:,:n] + z = z[:,:,-n-1:-1] - z[:,:,:n] + return (y - ((z**2) / n)) ** 0.5 diff --git a/kymata/preproc/data_cleansing.py b/kymata/preproc/data_cleansing.py old mode 100644 new mode 100755 index ca7c2321..75654835 --- a/kymata/preproc/data_cleansing.py +++ b/kymata/preproc/data_cleansing.py @@ -1,6 +1,6 @@ import os.path -from colorama import Fore +from colorama import Fore, Style import mne from numpy import zeros, nanmin from pandas import DataFrame, Index @@ -283,7 +283,6 @@ def _remove_ecg(filt_raw, ica): # heartbeats ica.plot_overlay(filt_raw, exclude=ica.exclude, picks='mag') - def _remove_ecg_eog(filt_raw, ica): """ Note: mutates `ica`. @@ -298,6 +297,122 @@ def _remove_ecg_eog(filt_raw, ica): ica.plot_scores(eog_scores) +def create_trialwise_data(dataset_directory_name: str, + list_of_participants: list[str], + repetitions_per_runs: int, + number_of_runs: int, + number_of_trials: int, + input_streams: list[str], + eeg_thresh: float, + grad_thresh: float, + mag_thresh: float, + visual_delivery_latency: float, + audio_delivery_latency: float, + audio_delivery_shift_correction: float, # seconds + tmin: float, + tmax: float, + ): + """Create trials objects from the raw data files (still in sensor space)""" + + global_droplog = [] + + data_path = '/imaging/projects/cbu/kymata/data/' + dataset_directory_name + save_folder = '3_trialwise_sensorspace' + + print(f"{Fore.GREEN}{Style.BRIGHT}Starting trials and {Style.RESET_ALL}") + + for p in list_of_participants: + + print(f"{Fore.GREEN}{Style.BRIGHT}...Concatenating trials{Style.RESET_ALL}") + + cleaned_raws = [] + + for run in range(1, number_of_runs + 1): + raw_fname = f'{data_path}/intrim_preprocessing_files/2_cleaned/{p}_run{run}_cleaned_raw.fif.gz' + if os.path.isfile(raw_fname): + raw = mne.io.Raw(raw_fname, preload=True) + events_ = mne.find_events(raw, stim_channel='STI101', shortest_event=1) + #print([i[0] for i in mne.pick_events(events_, include=2)]) + #print(mne.pick_events(events_, include=3)) + cleaned_raws.append(raw) + + raw = mne.io.concatenate_raws(raws=cleaned_raws, preload=True) + raw_events = mne.find_events(raw, stim_channel='STI101', shortest_event=1) #, uint_cast=True) + + print(f"{Fore.GREEN}{Style.BRIGHT}...finding visual events{Style.RESET_ALL}") + + # Extract visual events + visual_events = mne.pick_events(raw_events, include=[2, 3]) + + # Correct for visual equiptment latency error + visual_events = mne.event.shift_time_events(visual_events, [2, 3], visual_delivery_latency, 1) + + # trigger_name = 1 + # for trigger in visual_events: + # trigger[2] = trigger_name # rename as '1,2,3 ...400' etc + # if trigger_name == 400: + # trigger_name = 1 + # else: + # trigger_name = trigger_name + 1 + # + # # Test there are the correct number of events + # assert visual_events.shape[0] == repetitions_per_runs * number_of_runs * number_of_trials + + print(f"{Fore.GREEN}{Style.BRIGHT}...finding audio events{Style.RESET_ALL}") + + # Extract audio events + audio_events_raw = mne.pick_events(raw_events, include=3) + + # Correct for audio latency error + audio_events_raw = mne.event.shift_time_events(audio_events_raw, [3], audio_delivery_latency, 1) + + audio_events = zeros((len(audio_events_raw) * 400, 3), dtype=int) + for run, item in enumerate(audio_events_raw): + print('item: ', item) + audio_events_raw[run][2] = run # rename the audio events raw to pick apart later + for trial in range(number_of_trials): + audio_events[(trial + (number_of_trials * run))][0] = item[0] + round( + trial * (1000 + audio_delivery_shift_correction)) + audio_events[(trial + (number_of_trials * run))][1] = 0 + audio_events[(trial + (number_of_trials * run))][2] = trial + + # Test there are the correct number of events + assert audio_events.shape[0] == repetitions_per_runs * number_of_runs * number_of_trials # TODO handle overlapping visual/audio triggers + + print(f'\n Runs found: {audio_events.shape[0]} \n') + + # Denote picks + include = []; # ['MISC006'] # MISC05, trigger channels etc, if needed + picks = mne.pick_types(raw.info, meg=True, eeg=True, stim=False, exclude='bads', include=include) + + print(f"{Fore.GREEN}{Style.BRIGHT}... extract and save evoked data{Style.RESET_ALL}") + + for input_stream in input_streams: # TODO n.b. not setup for visual/tactile stream yet + if input_stream == 'auditory': + events = audio_events + #else: + # events = visual_events + + # Extract trial instances ('epochs') + + _tmin = -0.2 + _tmax = 400 + 0.8 + 2 # Extra space to account for audio file latency correction + epochs = mne.Epochs(raw, audio_events_raw, None, _tmin, _tmax, picks=picks, + baseline=(None, None), reject=dict(eeg=eeg_thresh, grad=grad_thresh, mag=mag_thresh), + preload=True) + + # Log which channels are worst + dropfig = epochs.plot_drop_log(subject=p) + dropfig.savefig(f'{data_path}/intrim_preprocessing_files/{save_folder}/logs/f{input_stream}_drop-log_{p}.jpg') + + global_droplog.append(f'[{input_stream}]{p}:{epochs.drop_log_stats(epochs.drop_log)}') + + for i in range(len(audio_events_raw)): + evoked = epochs[str(i)].average() + evoked.save(f'{data_path}/intrim_preprocessing_files/{save_folder}/evoked_data/{p}_rep{i}.fif', overwrite=True) + + evoked = epochs.average() + evoked.save(f'{data_path}/intrim_preprocessing_files/{save_folder}/evoked_data/{p}-ave.fif', overwrite=True) def create_trials(dataset_directory_name: str, @@ -311,7 +426,7 @@ def create_trials(dataset_directory_name: str, mag_thresh: float, visual_delivery_latency: float, audio_delivery_latency: float, - audio_delivery_shift_correction: float, + audio_delivery_shift_correction: float, # seconds tmin: float, tmax: float, ): @@ -367,7 +482,7 @@ def create_trials(dataset_directory_name: str, for run, item in enumerate(audio_events_raw): for trial in range(1, number_of_trials + 1): audio_events[(trial + (number_of_trials * run)) - 1][0] = item[0] + round( - (trial - 1) * (1000 + audio_delivery_shift_correction)) + (trial - 1) * 1000 * (1 + audio_delivery_shift_correction)) audio_events[(trial + (number_of_trials * run)) - 1][1] = 0 audio_events[(trial + (number_of_trials * run)) - 1][2] = trial diff --git a/submit_gridsearch.sh b/submit_gridsearch.sh new file mode 100755 index 00000000..41763242 --- /dev/null +++ b/submit_gridsearch.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +### +# To run gridsearch on the queue at the CBU, run the following command in command line: +# sbatch submit_gridsearch.sh +### + + +#SBATCH --job-name=gridsearch +#SBATCH --output=slurm_log.txt +#SBATCH --error=slurm_log.txt +#SBATCH --ntasks=1 +#SBATCH --time=05:00:00 +#SBATCH --mem=1000 +#SBATCH --array=1-1 +#SBATCH --exclusive + +conda activate mne_venv + +args=(5) # 2 3 4 5 6 7 8 9 10) +ARG=${args[$SLURM_ARRAY_TASK_ID - 1]} + +python invokers/run_gridsearch.py + # --snr $ARG # >> result3.txt + +conda deactivate