From 8d481964a2a31272fb596690f5b3d16df4448831 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sat, 18 Dec 2021 14:56:47 -0500 Subject: [PATCH 1/7] Add mega-analysis example. --- .../plot_compare_mega_and_meta-analysis.py | 106 ++++++++++++++++++ setup.cfg | 1 + 2 files changed, 107 insertions(+) create mode 100644 examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py new file mode 100644 index 0000000..5102f06 --- /dev/null +++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py @@ -0,0 +1,106 @@ +# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- +# ex: set sts=4 ts=4 sw=4 et: +""" +.. _meta3: + +================================================ + Run mega- and meta-analysis +================================================ + +Steps: + +1. Make a toy dataset +1. Run mega-analysis (linear mixed effects model with random intercepts for site) +1. Group dataset by site and run OLS on each site separately to construct derived toy + meta-analysis dataset +1. Run meta-analysis with DerSimonian-Laird between-study variance estimator +""" +import numpy as np +import statsmodels.api as sm + +############################################################################### +# Wrangle some example data +# ----------------------------------------------------------------------------- +data = sm.datasets.anes96.load_pandas().data +data.head() + +############################################################################### +dat = data["popul"] +bins = np.linspace(0, np.max(dat) + 1, 20) +digitized = np.digitize(dat, bins) +idx = {i: np.where(digitized == i)[0] for i in range(1, len(bins))} +idx = {k: v for k, v in idx.items() if v.size} + +# Assign "site" based on grouped populations +data["site"] = 0 +i = 0 +letters = ["a", "b", "c", "d", "e", "f", "g", "h"] +for k, v in idx.items(): + data.loc[v, "site"] = letters[i] + i += 1 + +data.head() + +############################################################################### +# The mega-analysis +# ----------------------------------------------------------------------------- +# Random intercepts model using site +# +# Do age and education predict TV news watching? +model = sm.MixedLM.from_formula("TVnews ~ age + educ", data, groups=data["site"]) +fitted_model = model.fit() +print(fitted_model.summary()) + +############################################################################### +# Create the meta-analysis dataset +# ----------------------------------- +import pandas as pd + +data["intercept"] = 1 +target_vars = ["educ", "age", "intercept"] + +# calculate mean and variance for each variance of interest +meta_df = [] +for site_name, site_df in data.groupby("site"): + model = sm.OLS(site_df["TVnews"], site_df[target_vars]) + fitted_model = model.fit() + + # extract parameter estimates and errors as Series + coefficients = fitted_model.params + std_errors = fitted_model.bse + + # convert standard error to sampling variance + sampling_variances = std_errors ** 2 + names = [n + "_var" for n in sampling_variances.index.tolist()] + sampling_variances.index = names + + # combine Series and convert to DataFrame + coefficients = coefficients.append(sampling_variances) + coefficients["site"] = site_name + coefficients["sample_size"] = site_df.shape[0] + temp_df = pd.DataFrame(coefficients).T + meta_df.append(temp_df) +# Combine across sites and convert objects to floats +meta_df = pd.concat(meta_df).reset_index(drop=True) +meta_df = meta_df.convert_dtypes() +print(meta_df.to_markdown()) + +from pymare import Dataset + +############################################################################### +# The meta-analysis +# -------------------------------------------- +# Are age and education significant predictors of TV news watching across the literature? +from pymare.estimators import DerSimonianLaird + +metamodel = DerSimonianLaird() +dset = Dataset( + y=meta_df[["age", "educ"]].values, + v=meta_df[["age_var", "educ_var"]].values, + add_intercept=True, +) +metamodel.fit_dataset(dset) + +############################################################################### +summary = metamodel.summary() +summary.get_fe_stats() diff --git a/setup.cfg b/setup.cfg index 5d281de..65e0d91 100644 --- a/setup.cfg +++ b/setup.cfg @@ -59,6 +59,7 @@ doc = sphinx-copybutton sphinx_gallery==0.10.1 sphinx_rtd_theme + statsmodels tests = codecov coverage From 3159975e8b10317348d653af37ae80dceb68f4d2 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sat, 18 Dec 2021 15:00:39 -0500 Subject: [PATCH 2/7] Update setup.cfg --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 65e0d91..d6e2790 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,6 +60,7 @@ doc = sphinx_gallery==0.10.1 sphinx_rtd_theme statsmodels + tabulate tests = codecov coverage From 0cc3330e63b80ae9072e26997f16eaef7cd3bb3c Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sat, 18 Dec 2021 15:06:44 -0500 Subject: [PATCH 3/7] Update plot_compare_mega_and_meta-analysis.py --- .../plot_compare_mega_and_meta-analysis.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py index 5102f06..20cccf3 100644 --- a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py +++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py @@ -11,8 +11,7 @@ 1. Make a toy dataset 1. Run mega-analysis (linear mixed effects model with random intercepts for site) -1. Group dataset by site and run OLS on each site separately to construct derived toy - meta-analysis dataset +1. Group dataset by site and run OLS on each site separately to construct meta-analysis dataset 1. Run meta-analysis with DerSimonian-Laird between-study variance estimator """ import numpy as np @@ -80,6 +79,7 @@ coefficients["sample_size"] = site_df.shape[0] temp_df = pd.DataFrame(coefficients).T meta_df.append(temp_df) + # Combine across sites and convert objects to floats meta_df = pd.concat(meta_df).reset_index(drop=True) meta_df = meta_df.convert_dtypes() @@ -95,8 +95,8 @@ metamodel = DerSimonianLaird() dset = Dataset( - y=meta_df[["age", "educ"]].values, - v=meta_df[["age_var", "educ_var"]].values, + y=meta_df[["age", "educ"]].values.astype(float), + v=meta_df[["age_var", "educ_var"]].values.astype(float), add_intercept=True, ) metamodel.fit_dataset(dset) From 3f4cafc8d744467462cdcf38f91eb77156aa7c67 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sat, 18 Dec 2021 15:09:24 -0500 Subject: [PATCH 4/7] Fix things. --- .../plot_compare_mega_and_meta-analysis.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py index 20cccf3..dddbde2 100644 --- a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py +++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py @@ -10,9 +10,9 @@ Steps: 1. Make a toy dataset -1. Run mega-analysis (linear mixed effects model with random intercepts for site) -1. Group dataset by site and run OLS on each site separately to construct meta-analysis dataset -1. Run meta-analysis with DerSimonian-Laird between-study variance estimator +2. Run mega-analysis (linear mixed effects model with random intercepts for site) +3. Group dataset by site and run OLS on each site separately to construct meta-analysis dataset +4. Run meta-analysis with DerSimonian-Laird between-study variance estimator """ import numpy as np import statsmodels.api as sm @@ -85,12 +85,11 @@ meta_df = meta_df.convert_dtypes() print(meta_df.to_markdown()) -from pymare import Dataset - ############################################################################### # The meta-analysis # -------------------------------------------- # Are age and education significant predictors of TV news watching across the literature? +from pymare import Dataset from pymare.estimators import DerSimonianLaird metamodel = DerSimonianLaird() From 6e1f584fb3465d323d8a25af5417eb0e64af7616 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sat, 18 Dec 2021 15:10:22 -0500 Subject: [PATCH 5/7] Update plot_compare_mega_and_meta-analysis.py --- .../plot_compare_mega_and_meta-analysis.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py index dddbde2..ce77fca 100644 --- a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py +++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py @@ -83,7 +83,7 @@ # Combine across sites and convert objects to floats meta_df = pd.concat(meta_df).reset_index(drop=True) meta_df = meta_df.convert_dtypes() -print(meta_df.to_markdown()) +meta_df ############################################################################### # The meta-analysis @@ -101,5 +101,8 @@ metamodel.fit_dataset(dset) ############################################################################### +from pprint import pprint + summary = metamodel.summary() -summary.get_fe_stats() +results = summary.get_fe_stats() +pprint(results) From 692f4df59702c3a4113cfa51fc8eded807df8561 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sun, 19 Dec 2021 14:47:13 -0500 Subject: [PATCH 6/7] Improve example. Still holding off on random slopes. --- .../plot_compare_mega_and_meta-analysis.py | 124 +++++++++++++----- 1 file changed, 91 insertions(+), 33 deletions(-) diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py index ce77fca..56a08b0 100644 --- a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py +++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py @@ -10,58 +10,121 @@ Steps: 1. Make a toy dataset -2. Run mega-analysis (linear mixed effects model with random intercepts for site) -3. Group dataset by site and run OLS on each site separately to construct meta-analysis dataset +2. Run mega-analysis (linear mixed effects model with random intercepts for study) +3. Group dataset by study and run OLS on each study separately to construct meta-analysis dataset 4. Run meta-analysis with DerSimonian-Laird between-study variance estimator """ +from pprint import pprint + +import matplotlib.pyplot as plt import numpy as np +import pandas as pd +import seaborn as sns import statsmodels.api as sm +from pymare import Dataset +from pymare.estimators import DerSimonianLaird + +sns.set_theme(style="ticks") + ############################################################################### # Wrangle some example data # ----------------------------------------------------------------------------- +# For this example, we will use the "American National Election Survey 1996" +# dataset from statsmodels. +# This dataset includes several elements we care about, including: +# +# 1. A large number of observations (944). +# 2. A variable that can be used as a substitute for "study". +# In this case, we will pretend that all observations with similar +# populations come from the same cities, which we will in turn pretend +# reflect different studies of the same phenomenon by different research +# groups. data = sm.datasets.anes96.load_pandas().data -data.head() +data.head(10) + +############################################################################### +# Convert the populations to "studies" by binning similar populations and +# assigning them to the same "study" values. + +# I selected the first study with a first author starting with each letter from Neurosynth. +# I figured it would give the dataset some verisimilitude. +study_names = [ + "Aarts & Roelofs (2011)", + "Baas et al. (2008)", + "Cabanis et al. (2013)", + "D'Agata et al. (2011)", + "Eack et al. (2008)", + "Fabbri, Caramazza, & Lingnau (2012)", + "Gaab, Gabrieli, & Glover (2007)", + "Haag et al. (2014)", +] + +data = data.sort_values(by="popul") +group_assignments = [] +split_assignments = np.array_split(np.arange(data.shape[0], dtype=int), 8) +for i_split, split in enumerate(split_assignments): + group_assignments += [study_names[i_split]] * len(split) + +data["study"] = group_assignments +data.head(10) + +############################################################################### +fig, ax = plt.subplots(figsize=(16, 8)) +sns.stripplot(data=data, x="logpopul", y="study", ax=ax) +fig.tight_layout() +fig.show() ############################################################################### -dat = data["popul"] -bins = np.linspace(0, np.max(dat) + 1, 20) -digitized = np.digitize(dat, bins) -idx = {i: np.where(digitized == i)[0] for i in range(1, len(bins))} -idx = {k: v for k, v in idx.items() if v.size} - -# Assign "site" based on grouped populations -data["site"] = 0 -i = 0 -letters = ["a", "b", "c", "d", "e", "f", "g", "h"] -for k, v in idx.items(): - data.loc[v, "site"] = letters[i] - i += 1 - -data.head() +# The variables of interest +# ----------------------------------------------------------------------------- +# First, let's talk about the variables of interest in this analysis. +# The variables are: +# +# 1. TVnews: The number of times, per week, that the respondent watches the news on TV. +# 2. age: The age of the respondent, in years. +# 3. educ: The education of the respondent, binned into the following groups: +# (1) Grade 1-8, (2) Some high school, (3) High school graduate, (4) Some college, +# (5) College graduate, (6) Master's degree, and (7) PhD. +# We will evaluate linear relationships with this coded variable, which isn't +# optimal, but this is just an example, so... ¯\\_(ツ)_/¯ +# +# If we visualize the distributions of the different variables, we'll see that +# they're not exactly normally distributed... +sns.pairplot(data[["study", "TVnews", "age", "educ"]], hue="study") ############################################################################### # The mega-analysis # ----------------------------------------------------------------------------- -# Random intercepts model using site +# Random intercepts model using study. +# In this model, we test the question, "to what extent do age and education +# predict TV news watching?" +# +# We assume that data from the same study may have different baseline news-watching levels +# (i.e., random intercepts). # # Do age and education predict TV news watching? -model = sm.MixedLM.from_formula("TVnews ~ age + educ", data, groups=data["site"]) +model = sm.MixedLM.from_formula("TVnews ~ age + educ", data, groups=data["study"]) fitted_model = model.fit() print(fitted_model.summary()) ############################################################################### # Create the meta-analysis dataset # ----------------------------------- -import pandas as pd - +# We assume that each study performed their own analyses testing the same hypothesis. +# The hypothesis, in this case, is "to what extent do age and education predict TV news watching"? +# +# The model used will, for our example, be exactly the same across studies. +# This model is just a GLM with age, education, and an intercept predicting the number of hours. +# Individual studies would then report the parameter estimate and variance for each term in the +# model. data["intercept"] = 1 target_vars = ["educ", "age", "intercept"] -# calculate mean and variance for each variance of interest +# calculate coefficient and variance for each variable of interest meta_df = [] -for site_name, site_df in data.groupby("site"): - model = sm.OLS(site_df["TVnews"], site_df[target_vars]) +for study_name, study_df in data.groupby("study"): + model = sm.OLS(study_df["TVnews"], study_df[target_vars]) fitted_model = model.fit() # extract parameter estimates and errors as Series @@ -75,12 +138,12 @@ # combine Series and convert to DataFrame coefficients = coefficients.append(sampling_variances) - coefficients["site"] = site_name - coefficients["sample_size"] = site_df.shape[0] + coefficients["study"] = study_name + coefficients["sample_size"] = study_df.shape[0] temp_df = pd.DataFrame(coefficients).T meta_df.append(temp_df) -# Combine across sites and convert objects to floats +# Combine across studies and convert objects to floats meta_df = pd.concat(meta_df).reset_index(drop=True) meta_df = meta_df.convert_dtypes() meta_df @@ -89,9 +152,6 @@ # The meta-analysis # -------------------------------------------- # Are age and education significant predictors of TV news watching across the literature? -from pymare import Dataset -from pymare.estimators import DerSimonianLaird - metamodel = DerSimonianLaird() dset = Dataset( y=meta_df[["age", "educ"]].values.astype(float), @@ -101,8 +161,6 @@ metamodel.fit_dataset(dset) ############################################################################### -from pprint import pprint - summary = metamodel.summary() results = summary.get_fe_stats() pprint(results) From d2081c5509624e29f4577625067d37145b75c2fc Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Sun, 19 Dec 2021 15:01:11 -0500 Subject: [PATCH 7/7] Remove tabulate. --- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d6e2790..65e0d91 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,7 +60,6 @@ doc = sphinx_gallery==0.10.1 sphinx_rtd_theme statsmodels - tabulate tests = codecov coverage