diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py new file mode 100644 index 0000000..56a08b0 --- /dev/null +++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py @@ -0,0 +1,166 @@ +# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- +# ex: set sts=4 ts=4 sw=4 et: +""" +.. _meta3: + +================================================ + Run mega- and meta-analysis +================================================ + +Steps: + +1. Make a toy dataset +2. Run mega-analysis (linear mixed effects model with random intercepts for study) +3. Group dataset by study and run OLS on each study separately to construct meta-analysis dataset +4. Run meta-analysis with DerSimonian-Laird between-study variance estimator +""" +from pprint import pprint + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import statsmodels.api as sm + +from pymare import Dataset +from pymare.estimators import DerSimonianLaird + +sns.set_theme(style="ticks") + +############################################################################### +# Wrangle some example data +# ----------------------------------------------------------------------------- +# For this example, we will use the "American National Election Survey 1996" +# dataset from statsmodels. +# This dataset includes several elements we care about, including: +# +# 1. A large number of observations (944). +# 2. A variable that can be used as a substitute for "study". +# In this case, we will pretend that all observations with similar +# populations come from the same cities, which we will in turn pretend +# reflect different studies of the same phenomenon by different research +# groups. +data = sm.datasets.anes96.load_pandas().data +data.head(10) + +############################################################################### +# Convert the populations to "studies" by binning similar populations and +# assigning them to the same "study" values. + +# I selected the first study with a first author starting with each letter from Neurosynth. +# I figured it would give the dataset some verisimilitude. +study_names = [ + "Aarts & Roelofs (2011)", + "Baas et al. (2008)", + "Cabanis et al. (2013)", + "D'Agata et al. (2011)", + "Eack et al. (2008)", + "Fabbri, Caramazza, & Lingnau (2012)", + "Gaab, Gabrieli, & Glover (2007)", + "Haag et al. (2014)", +] + +data = data.sort_values(by="popul") +group_assignments = [] +split_assignments = np.array_split(np.arange(data.shape[0], dtype=int), 8) +for i_split, split in enumerate(split_assignments): + group_assignments += [study_names[i_split]] * len(split) + +data["study"] = group_assignments +data.head(10) + +############################################################################### +fig, ax = plt.subplots(figsize=(16, 8)) +sns.stripplot(data=data, x="logpopul", y="study", ax=ax) +fig.tight_layout() +fig.show() + +############################################################################### +# The variables of interest +# ----------------------------------------------------------------------------- +# First, let's talk about the variables of interest in this analysis. +# The variables are: +# +# 1. TVnews: The number of times, per week, that the respondent watches the news on TV. +# 2. age: The age of the respondent, in years. +# 3. educ: The education of the respondent, binned into the following groups: +# (1) Grade 1-8, (2) Some high school, (3) High school graduate, (4) Some college, +# (5) College graduate, (6) Master's degree, and (7) PhD. +# We will evaluate linear relationships with this coded variable, which isn't +# optimal, but this is just an example, so... ¯\\_(ツ)_/¯ +# +# If we visualize the distributions of the different variables, we'll see that +# they're not exactly normally distributed... +sns.pairplot(data[["study", "TVnews", "age", "educ"]], hue="study") + +############################################################################### +# The mega-analysis +# ----------------------------------------------------------------------------- +# Random intercepts model using study. +# In this model, we test the question, "to what extent do age and education +# predict TV news watching?" +# +# We assume that data from the same study may have different baseline news-watching levels +# (i.e., random intercepts). +# +# Do age and education predict TV news watching? +model = sm.MixedLM.from_formula("TVnews ~ age + educ", data, groups=data["study"]) +fitted_model = model.fit() +print(fitted_model.summary()) + +############################################################################### +# Create the meta-analysis dataset +# ----------------------------------- +# We assume that each study performed their own analyses testing the same hypothesis. +# The hypothesis, in this case, is "to what extent do age and education predict TV news watching"? +# +# The model used will, for our example, be exactly the same across studies. +# This model is just a GLM with age, education, and an intercept predicting the number of hours. +# Individual studies would then report the parameter estimate and variance for each term in the +# model. +data["intercept"] = 1 +target_vars = ["educ", "age", "intercept"] + +# calculate coefficient and variance for each variable of interest +meta_df = [] +for study_name, study_df in data.groupby("study"): + model = sm.OLS(study_df["TVnews"], study_df[target_vars]) + fitted_model = model.fit() + + # extract parameter estimates and errors as Series + coefficients = fitted_model.params + std_errors = fitted_model.bse + + # convert standard error to sampling variance + sampling_variances = std_errors ** 2 + names = [n + "_var" for n in sampling_variances.index.tolist()] + sampling_variances.index = names + + # combine Series and convert to DataFrame + coefficients = coefficients.append(sampling_variances) + coefficients["study"] = study_name + coefficients["sample_size"] = study_df.shape[0] + temp_df = pd.DataFrame(coefficients).T + meta_df.append(temp_df) + +# Combine across studies and convert objects to floats +meta_df = pd.concat(meta_df).reset_index(drop=True) +meta_df = meta_df.convert_dtypes() +meta_df + +############################################################################### +# The meta-analysis +# -------------------------------------------- +# Are age and education significant predictors of TV news watching across the literature? +metamodel = DerSimonianLaird() +dset = Dataset( + y=meta_df[["age", "educ"]].values.astype(float), + v=meta_df[["age_var", "educ_var"]].values.astype(float), + add_intercept=True, +) +metamodel.fit_dataset(dset) + +############################################################################### +summary = metamodel.summary() +results = summary.get_fe_stats() +pprint(results) diff --git a/setup.cfg b/setup.cfg index a71d2e7..2fa821c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,6 +60,7 @@ doc = sphinx_gallery==0.10.1 sphinx_rtd_theme sphinxcontrib-bibtex + statsmodels tests = codecov coverage