new cluster test API #12663

Draft
wants to merge 99 commits into base: main
Changes from 24 commits (99 commits total)
62daaf0
added cluster test api, first commit
CarinaFo Jun 14, 2024
d59978f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2024
2843905
tested dataframe function and results, cleaned up
CarinaFo Jun 14, 2024
1985da3
Merge branch 'new_cluster_stats_api_GSOC24' of https://github.com/Car…
CarinaFo Jun 14, 2024
fa5b215
added ToDos
CarinaFo Jun 14, 2024
1a1511d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2024
0ea220c
Merge branch 'new_cluster_stats_api_GSOC24' of https://github.com/Car…
CarinaFo Jun 14, 2024
a12cf95
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2024
3c5d4f1
Merge branch 'mne-tools:main' into new_cluster_stats_api_GSOC24
CarinaFo Jun 19, 2024
45ce63a
added formula support and implemented suggestions
CarinaFo Jun 19, 2024
2b7bae8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 19, 2024
38834ba
fixed linting errors
CarinaFo Jun 22, 2024
c00859f
ENH: Add dataset [skip azp] [skip actions]
larsoner Jun 25, 2024
9c8ec90
FIX: One more [skip azp] [skip actions]
larsoner Jun 25, 2024
47363b5
FIX: Title [skip azp] [skip actions]
larsoner Jun 25, 2024
1f6221d
first draft of formulaic paired t-test
CarinaFo Jun 30, 2024
37616e5
first draft without cluster plotting class implemented
CarinaFo Jul 6, 2024
6aaef9a
cleaned up plotting function
CarinaFo Jul 6, 2024
0f99c70
implemented cluser results class
CarinaFo Jul 6, 2024
4083691
added contribution
CarinaFo Jul 6, 2024
42d70f9
Merge branch 'mne-tools:main' into new_cluster_stats_api_GSOC24
CarinaFo Jul 18, 2024
8a57215
Merge branch 'new_cluster_stats_api_GSOC24' of https://github.com/Car…
CarinaFo Jul 18, 2024
7e9b2e5
fixed codespell
CarinaFo Jul 18, 2024
12f27dd
Merge branch 'main' into new_cluster_stats_api_GSOC24
CarinaFo Jul 19, 2024
8f510a9
first review
CarinaFo Jul 22, 2024
d6c0c4c
quick clean up
CarinaFo Jul 22, 2024
f17f38f
test compare_old_vs_new_cluster_API
CarinaFo Jul 22, 2024
9d592de
simplify tests
drammock Jul 25, 2024
d64ef84
refactor cluster_test
drammock Jul 25, 2024
dc8a799
make tutorial match modified API
drammock Jul 25, 2024
f12cf6e
remove unused test helper func
drammock Jul 25, 2024
5b97971
vulture allowlist update
drammock Jul 25, 2024
5f5b0fc
included BaseTFR in validate_cluster_df
CarinaFo Jul 28, 2024
ccccb5b
comments on cluster_test function
CarinaFo Jul 28, 2024
59b1a3a
updated clusterResult class and plot function
CarinaFo Jul 28, 2024
98d0879
updated function call for plotting
CarinaFo Jul 28, 2024
ec03242
changed color
CarinaFo Jul 28, 2024
a513518
Merge branch 'main' into new_cluster_stats_api_GSOC24
CarinaFo Jul 31, 2024
1a5dd9d
Merge branch 'new_cluster_stats_api_GSOC24' of https://github.com/Car…
CarinaFo Jul 31, 2024
5941f61
docstring/docdict cleanups and fixes
drammock Aug 1, 2024
368fa44
implemented Dan's comments
CarinaFo Aug 5, 2024
3aa32b6
implemented Dan's comments
CarinaFo Aug 5, 2024
a76afd3
test for handling different MNE objects - test is failing
CarinaFo Aug 5, 2024
b5fce8b
adjusted test to account for multiple subjects
CarinaFo Aug 6, 2024
3ce510c
refactor df validation to return bools
drammock Aug 10, 2024
feb1911
unrelated typing fix
drammock Aug 10, 2024
6f97811
rework test
drammock Aug 10, 2024
b09d20a
minor cleanup
drammock Aug 12, 2024
977e153
fix imports
drammock Aug 12, 2024
a288d85
use MRO in test too
drammock Aug 12, 2024
81ce0d0
Merge pull request #5 from drammock/new_cluster_stats_api_GSOC24
CarinaFo Aug 21, 2024
05586c8
added cluster test api, first commit
CarinaFo Jun 14, 2024
e8770fd
tested dataframe function and results, cleaned up
CarinaFo Jun 14, 2024
a081d7d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2024
d6d70c8
added ToDos
CarinaFo Jun 14, 2024
8345261
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2024
0373195
added formula support and implemented suggestions
CarinaFo Jun 19, 2024
8bc44f9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 19, 2024
654a350
fixed linting errors
CarinaFo Jun 22, 2024
d1ed8a1
ENH: Add dataset [skip azp] [skip actions]
larsoner Jun 25, 2024
c634a44
FIX: One more [skip azp] [skip actions]
larsoner Jun 25, 2024
0c2eb4f
FIX: Title [skip azp] [skip actions]
larsoner Jun 25, 2024
f46a79c
first draft of formulaic paired t-test
CarinaFo Jun 30, 2024
5d1cbae
first draft without cluster plotting class implemented
CarinaFo Jul 6, 2024
268d0cf
cleaned up plotting function
CarinaFo Jul 6, 2024
2f722bd
implemented cluser results class
CarinaFo Jul 6, 2024
fb75cfd
fixed codespell
CarinaFo Jul 18, 2024
a87ffed
first review
CarinaFo Jul 22, 2024
1f857ad
quick clean up
CarinaFo Jul 22, 2024
450738b
test compare_old_vs_new_cluster_API
CarinaFo Jul 22, 2024
d41efbe
simplify tests
drammock Jul 25, 2024
9523fae
refactor cluster_test
drammock Jul 25, 2024
9661492
make tutorial match modified API
drammock Jul 25, 2024
cac0559
remove unused test helper func
drammock Jul 25, 2024
47ac838
vulture allowlist update
drammock Jul 25, 2024
033c158
included BaseTFR in validate_cluster_df
CarinaFo Jul 28, 2024
2c2f341
comments on cluster_test function
CarinaFo Jul 28, 2024
e9b5fa2
updated clusterResult class and plot function
CarinaFo Jul 28, 2024
2fd17d3
updated function call for plotting
CarinaFo Jul 28, 2024
150c530
changed color
CarinaFo Jul 28, 2024
3cc9e2c
docstring/docdict cleanups and fixes
drammock Aug 1, 2024
2c27a69
implemented Dan's comments
CarinaFo Aug 5, 2024
2664ee2
implemented Dan's comments
CarinaFo Aug 5, 2024
4927544
test for handling different MNE objects - test is failing
CarinaFo Aug 5, 2024
006acdf
adjusted test to account for multiple subjects
CarinaFo Aug 6, 2024
f0f4cba
refactor df validation to return bools
drammock Aug 10, 2024
346e3ce
unrelated typing fix
drammock Aug 10, 2024
a49d2cd
rework test
drammock Aug 10, 2024
a01182b
minor cleanup
drammock Aug 12, 2024
0984b61
fix imports
drammock Aug 12, 2024
6322499
use MRO in test too
drammock Aug 12, 2024
a04b8a3
fix vulture allowlist
drammock Aug 22, 2024
f1d39bf
fix nesting and type hints
drammock Aug 22, 2024
987ea43
strict=False
drammock Aug 22, 2024
78829b4
nest import in test file too
drammock Aug 22, 2024
eb98849
Merge branch 'new_cluster_stats_api_GSOC24' of https://github.com/Car…
CarinaFo Aug 23, 2024
ac943e3
Merge branch 'main' into new_cluster_stats_api_GSOC24
CarinaFo Sep 17, 2024
372bcca
clean up pyproject mess
CarinaFo Oct 2, 2024
4da8463
add n_permutations, plotting, added min_cluster_p_value
CarinaFo Oct 2, 2024
1 change: 1 addition & 0 deletions environment.yml
@@ -65,3 +65,4 @@ dependencies:
- lazy_loader
- defusedxml
- python-neo
- formulaic
4 changes: 2 additions & 2 deletions mne/datasets/config.py
@@ -90,7 +90,7 @@
# here: ↓↓↓↓↓↓↓↓
RELEASES = dict(
testing="0.152",
-misc="0.27",
+misc="0.30",
phantom_kit="0.2",
ucl_opm_auditory="0.2",
)
@@ -131,7 +131,7 @@
)
MNE_DATASETS["misc"] = dict(
archive_name=f"{MISC_VERSIONED}.tar.gz", # 'mne-misc-data',
-hash="md5:e343d3a00cb49f8a2f719d14f4758afe",
+hash="md5:201d35531d3c03701cf50e38bb73481f",
url=(
"https://codeload.github.com/mne-tools/mne-misc-data/tar.gz/"
f'{RELEASES["misc"]}'
315 changes: 315 additions & 0 deletions mne/stats/cluster_level.py
@@ -6,16 +6,22 @@
# Eric Larson <[email protected]>
# Denis Engemann <[email protected]>
# Fernando Perez (bin_perm_rep function)
# Carina Forster <[email protected]>
#
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import ndimage, sparse
from scipy.sparse.csgraph import connected_components
from scipy.stats import f as fstat
from scipy.stats import t as tstat

from .. import EvokedArray
from ..channels import find_ch_adjacency
from ..fixes import has_numba, jit
from ..parallel import parallel_func
from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
@@ -24,13 +30,15 @@
ProgressBar,
_check_option,
_pl,
_soft_import,
_validate_type,
check_random_state,
logger,
split_list,
verbose,
warn,
)
from ..viz import plot_compare_evokeds
from .parametric import f_oneway, ttest_1samp_no_p


@@ -1729,3 +1737,310 @@ def summarize_clusters_stc(
data_summary[:, 0] = np.sum(data_summary, axis=1)

return klass(data_summary, vertices, tmin, tstep, subject)


def cluster_test(
df: pd.DataFrame,
formula: str | None = None,  # Wilkinson notation formula for design matrix
n_permutations: int = 10000,
seed: None | int | np.random.RandomState = None,
tail: int = 0,  # 0 for two-tailed, 1 for greater, -1 for less
n_jobs: int = 1,  # how many cores to use
adjacency: tuple | None = None,
max_step: int = 1,  # maximum distance between samples (time points)
exclude: list | None = None,  # time points/channels to exclude (None = keep all)
step_down_p: float = 0,  # threshold for the step-down-in-jumps test
t_power: float = 1,  # power to raise the stats by before summing
out_type: str = "indices",
check_disjoint: bool = False,
buffer_size: int | None = None,  # block size for chunking the data
):
"""
Run a cluster permutation test based on formulaic input.

Currently only supports a paired t-test on evokeds or epochs.

Parameters
----------
df : pd.DataFrame
Dataframe with evoked/epoched data, conditions and subject IDs.
formula : str, optional
Wilkinson notation formula for design matrix. Default is None.
n_permutations : int, optional
Number of permutations. Default is 10000.
seed : None | int | np.random.RandomState, optional
Seed for the random number generator. Default is None.
tail : int, optional
0 for two-tailed, 1 for greater, -1 for less. Default is 0.
n_jobs : int, optional
How many cores to use. Default is 1.
adjacency : tuple | None, optional
Adjacency matrix; if None, one is computed from the data. Default is None.
max_step : int, optional
Maximum distance between samples (time points). Default is 1.
exclude : list | None, optional
Time points or channels to exclude. Default is None (keep all).
step_down_p : float, optional
Threshold p-value for the step-down-in-jumps test. Default is 0 (off).
t_power : float, optional
Power to raise the statistical values by before summing (for weighting each location). Default is 1.
out_type : str, optional
Output type, "indices" or "mask". Default is "indices".
check_disjoint : bool, optional
Check if clusters are disjoint. Default is False.
buffer_size : int, optional
Block size for chunking the data. Default is None.

Returns
-------
ClusterResult
Object containing the results of the cluster permutation test.
"""
# for now this assumes a dataframe with a column for evoked data or epochs
# add a data column to the dataframe (numpy array)
df["data"] = [evoked.data for evoked in df.evoked]

# extract number of channels and timepoints
# (eventually should also allow for frequency)
n_channels, n_timepoints = df["data"][0].shape

# convert wide format to long format for formulaic
df_long = unpack_time_and_channels(df)

# pivot the DataFrame
pivot_df = df_long.pivot_table(
index=["subject_index", "channel", "timepoint"],
columns="condition",
values="value",
).reset_index()

# if not 2 unique conditions raise error
if len(pd.unique(df.condition)) != 2:
raise ValueError("Condition list needs to contain 2 unique values")

# Get unique elements and the indices of their first occurrences
unique_elements, indices = np.unique(df.condition, return_index=True)

# Sort unique elements by the indices of their first occurrences
conditions = unique_elements[np.argsort(indices)]

# log the contrast used for the paired t-test
logger.info(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}")

# Compute the difference (assuming there are only 2 conditions)
pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]]

# Optional: Clean up the DataFrame
pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]]

# check if formula is present
if formula is not None:
formulaic = _soft_import(
"formulaic", purpose="set up Design Matrix"
) # soft import (not a dependency for MNE)

# for the paired t-test y is the difference between conditions
# X is the design matrix with a column with 1s and 0s for each participant
# Create the design matrix using formulaic
y, X = formulaic.model_matrix(formula, pivot_df)
else:
raise ValueError(
"Formula is required and needs to be a string in Wilkinson notation."
)

# now prep design matrix for input into MNE cluster function
# cluster function expects channels as the last dimension
y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1)

if adjacency is None:
adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg")

# define stat function and threshold
stat_fun, threshold = _check_fun(
X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within"
)

# Run the cluster-based permutation test
T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test(
[y_for_cluster],
n_permutations=n_permutations,
threshold=threshold,
stat_fun=stat_fun,
tail=tail,
n_jobs=n_jobs,
adjacency=adjacency,
max_step=max_step, # maximum distance between samples (time points)
exclude=exclude, # exclude no time points or channels
step_down_p=step_down_p, # step down in jumps test
t_power=t_power, # weigh each location by its stats score
out_type=out_type,
check_disjoint=check_disjoint,
buffer_size=buffer_size, # block size for chunking the data
seed=seed,
)

logger.info(f"smallest cluster p-value: {min(cluster_p_values)}")

return ClusterResult(T_obs, clusters, cluster_p_values, H0)
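Not part of the diff: a minimal sketch of the input frame `cluster_test` expects, one row per subject/condition pair, with random numpy arrays standing in for the per-observation data. The subject count, condition labels, and shapes here are invented; a real call would hold Evoked objects in an `evoked` column and pass a Wilkinson-notation formula.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_subjects, n_channels, n_timepoints = 5, 3, 10

# one row per subject/condition pair; "data" stands in for evoked.data
rows = [
    dict(
        subject_index=subj,
        condition=cond,
        data=rng.standard_normal((n_channels, n_timepoints)),
    )
    for subj in range(n_subjects)
    for cond in ("faces", "cars")  # hypothetical condition labels
]
df = pd.DataFrame(rows)
```

A real invocation would then look roughly like `cluster_test(df, formula=...)`, with the exact formula depending on the design.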


def unpack_time_and_channels(df: pd.DataFrame) -> pd.DataFrame:
"""
Extract timepoints and channels and convert to long.

Parameters
----------
df : pd.DataFrame
DataFrame in wide format.

Returns
-------
df_long : pd.DataFrame
DataFrame in long format.
"""
# Extracting all necessary data using list comprehensions for better performance
long_format_data = [
{
"condition": row["condition"],
"subject_index": row["subject_index"],
"channel": channel,
"timepoint": timepoint,
"value": row["data"][channel, timepoint],
}
for idx, row in df.iterrows()
for channel in range(row["data"].shape[0])
for timepoint in range(row["data"].shape[1])
]

# Creating the long format DataFrame
df_long = pd.DataFrame(long_format_data)
Reviewer comment (drammock): the approach of first making a list of dictionaries, and then creating a dataframe from that, will (I think?) involve an extra copy, which means increased memory usage. So if possible, try to find a way to use pd.DataFrame.explode or a similar approach, to go from wide DF to long DF in one step.

return df_long
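Not part of the diff: one way to follow the review suggestion above, sketched under the assumption that `data` holds equally-shaped 2D arrays, is to pre-build flattened index arrays and let `pd.DataFrame.explode` do the wide-to-long step. The function name and demo data are hypothetical.

```python
import numpy as np
import pandas as pd

def unpack_time_and_channels_fast(df: pd.DataFrame) -> pd.DataFrame:
    """Wide-to-long in one step via DataFrame.explode (per the review suggestion)."""
    n_ch, n_tp = df["data"].iloc[0].shape
    out = df[["condition", "subject_index"]].copy()
    out["value"] = df["data"].map(np.ravel)  # flatten each (n_ch, n_tp) array
    out["channel"] = [np.repeat(np.arange(n_ch), n_tp)] * len(df)
    out["timepoint"] = [np.tile(np.arange(n_tp), n_ch)] * len(df)
    # multi-column explode keeps value/channel/timepoint aligned row-wise
    return out.explode(["value", "channel", "timepoint"], ignore_index=True)

# tiny demo: one subject, one condition, a 2-channel x 3-timepoint array
demo = pd.DataFrame(
    {"condition": ["a"], "subject_index": [0], "data": [np.arange(6).reshape(2, 3)]}
)
long_df = unpack_time_and_channels_fast(demo)
```

Multi-column `explode` needs pandas >= 1.3; whether this actually saves memory over the list-of-dicts version would need profiling.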


class ClusterResult:
"""
Object containing the results of the cluster permutation test.

Parameters
----------
T_obs : np.ndarray
The observed test statistic.
clusters : list
List of clusters.
cluster_p_values : np.ndarray
P-values for each cluster.
H0 : np.ndarray
Max cluster level stats observed under permutation.
"""

def __init__(self, T_obs, clusters, cluster_p_values, H0):
self.T_obs = T_obs
self.clusters = clusters
self.cluster_p_values = cluster_p_values
self.H0 = H0

def plot_cluster(self, cond_dict: dict):
"""
Plot the cluster with the lowest p-value.

2D cluster plotted with topoplot on the left and evoked signals on the right.
Timepoints that are part of the cluster are
highlighted in green on the evoked signals.

Parameters
----------
cond_dict : dict
Dictionary with condition labels as keys and lists of evoked objects as values.

Returns
-------
None

"""
# extract condition labels from the dictionary
cond_keys = list(cond_dict.keys())
# extract the evokeds from the dictionary
cond_values = list(cond_dict.values())

# configure variables for visualization
colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"}

lowest_p_cluster = np.argmin(self.cluster_p_values)

# plot the cluster with the lowest p-value
time_inds, space_inds = np.squeeze(self.clusters[lowest_p_cluster])
ch_inds = np.unique(space_inds)
time_inds = np.unique(time_inds)

# get topography for t stat (mean over the cluster's time points)
t_map = self.T_obs[time_inds, ...].mean(axis=0)

# get signals at the sensors contributing to the cluster
sig_times = cond_values[0][0].times[time_inds]

# create spatial mask
mask = np.zeros((t_map.shape[0], 1), dtype=bool)
mask[ch_inds, :] = True

# initialize figure
fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained")

# plot average test statistic and mark significant sensors
t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0)
t_evoked.plot_topomap(
times=0,
mask=mask,
axes=ax_topo,
cmap="RdBu_r",
show=False,
colorbar=False,
mask_params=dict(markersize=10),
scalings=1.00,
)
image = ax_topo.images[0]

# remove the title that would otherwise say "0.000 s"
ax_topo.set_title("")

# soft import?
# make_axes_locatable = _soft_import(
# "mpl_toolkits.axes_grid1.make_axes_locatable",
# purpose="plot cluster results"
# ) # soft import (not a dependency for MNE)

# create additional axes (for ERF and colorbar)
divider = make_axes_locatable(ax_topo)

# add axes for colorbar
ax_colorbar = divider.append_axes("right", size="5%", pad=0.1)
cbar = plt.colorbar(image, cax=ax_colorbar)
cbar.set_label("t-value")
ax_topo.set_xlabel(
"average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]])
)

# add new axis for time courses and plot time courses
ax_signals = divider.append_axes("right", size="300%", pad=1.3)
title = f"Signal averaged over {len(ch_inds)} sensor(s)"
plot_compare_evokeds(
cond_dict,
title=title,
picks=ch_inds,
axes=ax_signals,
colors=colors,
show=False,
split_legend=True,
truncate_yaxis="auto",
truncate_xaxis=False,
)
plt.legend(frameon=False, loc="upper left")

# plot temporal cluster extent
ymin, ymax = ax_signals.get_ylim()
ax_signals.fill_betweenx(
(ymin, ymax), sig_times[0], sig_times[-1], color="grey", alpha=0.3
)

plt.show()
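Not part of the diff: `plot_cluster` starts by reducing the most significant cluster to unique channel and time indices. A toy sketch of that selection step, with made-up permutation-test outputs (in the real class these come from `_permutation_cluster_test` with `out_type="indices"`):

```python
import numpy as np

# hypothetical cluster-test outputs: p-values and (time_inds, space_inds) pairs
cluster_p_values = np.array([0.04, 0.20, 0.50])
clusters = [
    (np.array([3, 3, 4, 4]), np.array([0, 1, 0, 1])),
    (np.array([7]), np.array([2])),
    (np.array([9, 9]), np.array([1, 2])),
]

lowest_p_cluster = np.argmin(cluster_p_values)  # cluster chosen for plotting
time_inds, space_inds = clusters[lowest_p_cluster]
ch_inds = np.unique(space_inds)   # sensors contributing to the cluster
time_inds = np.unique(time_inds)  # time points spanned by the cluster
```

`ch_inds` then drives the topomap mask and `plot_compare_evokeds` picks, and `time_inds` drives the shaded temporal extent.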
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -107,6 +107,7 @@ full-no-qt = [
"snirf",
"defusedxml",
"neo",
"formulaic",
]
full = ["mne[full-no-qt]", "PyQt6!=6.6.0", "PyQt6-Qt6!=6.6.0,!=6.7.0"]
full-pyqt6 = ["mne[full]"]
@@ -145,6 +146,7 @@ test_extra = [
"snirf",
"neo",
"mne-bids",
"formulaic",
]

# Dependencies for building the documentation
@@ -157,6 +159,7 @@ doc = [
"sphinxcontrib-towncrier",
"memory_profiler",
"neo",
"formulaic",
"seaborn!=0.11.2",
"sphinx_copybutton",
"sphinx-design",