From 8d481964a2a31272fb596690f5b3d16df4448831 Mon Sep 17 00:00:00 2001
From: Taylor Salo <tsalo006@fiu.edu>
Date: Sat, 18 Dec 2021 14:56:47 -0500
Subject: [PATCH 1/7] Add mega-analysis example.

---
 .../plot_compare_mega_and_meta-analysis.py    | 106 ++++++++++++++++++
 setup.cfg                                     |   1 +
 2 files changed, 107 insertions(+)
 create mode 100644 examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py

diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
new file mode 100644
index 0000000..5102f06
--- /dev/null
+++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
@@ -0,0 +1,106 @@
+# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
+# ex: set sts=4 ts=4 sw=4 et:
+"""
+.. _meta3:
+
+================================================
+ Run mega- and meta-analysis
+================================================
+
+Steps:
+
+1.  Make a toy dataset
+1.  Run mega-analysis (linear mixed effects model with random intercepts for site)
+1.  Group dataset by site and run OLS on each site separately to construct derived toy
+    meta-analysis dataset
+1.  Run meta-analysis with DerSimonian-Laird between-study variance estimator
+"""
+import numpy as np
+import statsmodels.api as sm
+
+###############################################################################
+# Wrangle some example data
+# -----------------------------------------------------------------------------
+data = sm.datasets.anes96.load_pandas().data
+data.head()
+
+###############################################################################
+dat = data["popul"]
+bins = np.linspace(0, np.max(dat) + 1, 20)
+digitized = np.digitize(dat, bins)
+idx = {i: np.where(digitized == i)[0] for i in range(1, len(bins))}
+idx = {k: v for k, v in idx.items() if v.size}
+
+# Assign "site" based on grouped populations
+data["site"] = 0
+i = 0
+letters = ["a", "b", "c", "d", "e", "f", "g", "h"]
+for k, v in idx.items():
+    data.loc[v, "site"] = letters[i]
+    i += 1
+
+data.head()
+
+###############################################################################
+# The mega-analysis
+# -----------------------------------------------------------------------------
+# Random intercepts model using site
+#
+# Do age and education predict TV news watching?
+model = sm.MixedLM.from_formula("TVnews ~ age + educ", data, groups=data["site"])
+fitted_model = model.fit()
+print(fitted_model.summary())
+
+###############################################################################
+# Create the meta-analysis dataset
+# -----------------------------------
+import pandas as pd
+
+data["intercept"] = 1
+target_vars = ["educ", "age", "intercept"]
+
+# calculate mean and variance for each variance of interest
+meta_df = []
+for site_name, site_df in data.groupby("site"):
+    model = sm.OLS(site_df["TVnews"], site_df[target_vars])
+    fitted_model = model.fit()
+
+    # extract parameter estimates and errors as Series
+    coefficients = fitted_model.params
+    std_errors = fitted_model.bse
+
+    # convert standard error to sampling variance
+    sampling_variances = std_errors ** 2
+    names = [n + "_var" for n in sampling_variances.index.tolist()]
+    sampling_variances.index = names
+
+    # combine Series and convert to DataFrame
+    coefficients = coefficients.append(sampling_variances)
+    coefficients["site"] = site_name
+    coefficients["sample_size"] = site_df.shape[0]
+    temp_df = pd.DataFrame(coefficients).T
+    meta_df.append(temp_df)
+# Combine across sites and convert objects to floats
+meta_df = pd.concat(meta_df).reset_index(drop=True)
+meta_df = meta_df.convert_dtypes()
+print(meta_df.to_markdown())
+
+from pymare import Dataset
+
+###############################################################################
+# The meta-analysis
+# --------------------------------------------
+# Are age and education significant predictors of TV news watching across the literature?
+from pymare.estimators import DerSimonianLaird
+
+metamodel = DerSimonianLaird()
+dset = Dataset(
+    y=meta_df[["age", "educ"]].values,
+    v=meta_df[["age_var", "educ_var"]].values,
+    add_intercept=True,
+)
+metamodel.fit_dataset(dset)
+
+###############################################################################
+summary = metamodel.summary()
+summary.get_fe_stats()
diff --git a/setup.cfg b/setup.cfg
index 5d281de..65e0d91 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -59,6 +59,7 @@ doc =
     sphinx-copybutton
     sphinx_gallery==0.10.1
     sphinx_rtd_theme
+    statsmodels
 tests =
     codecov
     coverage

From 3159975e8b10317348d653af37ae80dceb68f4d2 Mon Sep 17 00:00:00 2001
From: Taylor Salo <tsalo006@fiu.edu>
Date: Sat, 18 Dec 2021 15:00:39 -0500
Subject: [PATCH 2/7] Update setup.cfg

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index 65e0d91..d6e2790 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -60,6 +60,7 @@ doc =
     sphinx_gallery==0.10.1
     sphinx_rtd_theme
     statsmodels
+    tabulate
 tests =
     codecov
     coverage

From 0cc3330e63b80ae9072e26997f16eaef7cd3bb3c Mon Sep 17 00:00:00 2001
From: Taylor Salo <tsalo006@fiu.edu>
Date: Sat, 18 Dec 2021 15:06:44 -0500
Subject: [PATCH 3/7] Update plot_compare_mega_and_meta-analysis.py

---
 .../plot_compare_mega_and_meta-analysis.py                | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
index 5102f06..20cccf3 100644
--- a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
+++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
@@ -11,8 +11,7 @@
 
 1.  Make a toy dataset
 1.  Run mega-analysis (linear mixed effects model with random intercepts for site)
-1.  Group dataset by site and run OLS on each site separately to construct derived toy
-    meta-analysis dataset
+1.  Group dataset by site and run OLS on each site separately to construct meta-analysis dataset
 1.  Run meta-analysis with DerSimonian-Laird between-study variance estimator
 """
 import numpy as np
@@ -80,6 +79,7 @@
     coefficients["sample_size"] = site_df.shape[0]
     temp_df = pd.DataFrame(coefficients).T
     meta_df.append(temp_df)
+
 # Combine across sites and convert objects to floats
 meta_df = pd.concat(meta_df).reset_index(drop=True)
 meta_df = meta_df.convert_dtypes()
@@ -95,8 +95,8 @@
 
 metamodel = DerSimonianLaird()
 dset = Dataset(
-    y=meta_df[["age", "educ"]].values,
-    v=meta_df[["age_var", "educ_var"]].values,
+    y=meta_df[["age", "educ"]].values.astype(float),
+    v=meta_df[["age_var", "educ_var"]].values.astype(float),
     add_intercept=True,
 )
 metamodel.fit_dataset(dset)

From 3f4cafc8d744467462cdcf38f91eb77156aa7c67 Mon Sep 17 00:00:00 2001
From: Taylor Salo <tsalo006@fiu.edu>
Date: Sat, 18 Dec 2021 15:09:24 -0500
Subject: [PATCH 4/7] Fix things.

---
 .../plot_compare_mega_and_meta-analysis.py               | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
index 20cccf3..dddbde2 100644
--- a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
+++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
@@ -10,9 +10,9 @@
 Steps:
 
 1.  Make a toy dataset
-1.  Run mega-analysis (linear mixed effects model with random intercepts for site)
-1.  Group dataset by site and run OLS on each site separately to construct meta-analysis dataset
-1.  Run meta-analysis with DerSimonian-Laird between-study variance estimator
+2.  Run mega-analysis (linear mixed effects model with random intercepts for site)
+3.  Group dataset by site and run OLS on each site separately to construct meta-analysis dataset
+4.  Run meta-analysis with DerSimonian-Laird between-study variance estimator
 """
 import numpy as np
 import statsmodels.api as sm
@@ -85,12 +85,11 @@
 meta_df = meta_df.convert_dtypes()
 print(meta_df.to_markdown())
 
-from pymare import Dataset
-
 ###############################################################################
 # The meta-analysis
 # --------------------------------------------
 # Are age and education significant predictors of TV news watching across the literature?
+from pymare import Dataset
 from pymare.estimators import DerSimonianLaird
 
 metamodel = DerSimonianLaird()

From 6e1f584fb3465d323d8a25af5417eb0e64af7616 Mon Sep 17 00:00:00 2001
From: Taylor Salo <tsalo006@fiu.edu>
Date: Sat, 18 Dec 2021 15:10:22 -0500
Subject: [PATCH 5/7] Update plot_compare_mega_and_meta-analysis.py

---
 .../plot_compare_mega_and_meta-analysis.py                 | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
index dddbde2..ce77fca 100644
--- a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
+++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
@@ -83,7 +83,7 @@
 # Combine across sites and convert objects to floats
 meta_df = pd.concat(meta_df).reset_index(drop=True)
 meta_df = meta_df.convert_dtypes()
-print(meta_df.to_markdown())
+meta_df
 
 ###############################################################################
 # The meta-analysis
@@ -101,5 +101,8 @@
 metamodel.fit_dataset(dset)
 
 ###############################################################################
+from pprint import pprint
+
 summary = metamodel.summary()
-summary.get_fe_stats()
+results = summary.get_fe_stats()
+pprint(results)

From 692f4df59702c3a4113cfa51fc8eded807df8561 Mon Sep 17 00:00:00 2001
From: Taylor Salo <tsalo006@fiu.edu>
Date: Sun, 19 Dec 2021 14:47:13 -0500
Subject: [PATCH 6/7] Improve example. Still holding off on random slopes.

---
 .../plot_compare_mega_and_meta-analysis.py    | 124 +++++++++++++-----
 1 file changed, 91 insertions(+), 33 deletions(-)

diff --git a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
index ce77fca..56a08b0 100644
--- a/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
+++ b/examples/02_meta-analysis/plot_compare_mega_and_meta-analysis.py
@@ -10,58 +10,121 @@
 Steps:
 
 1.  Make a toy dataset
-2.  Run mega-analysis (linear mixed effects model with random intercepts for site)
-3.  Group dataset by site and run OLS on each site separately to construct meta-analysis dataset
+2.  Run mega-analysis (linear mixed effects model with random intercepts for study)
+3.  Group dataset by study and run OLS on each study separately to construct meta-analysis dataset
 4.  Run meta-analysis with DerSimonian-Laird between-study variance estimator
 """
+from pprint import pprint
+
+import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+import seaborn as sns
 import statsmodels.api as sm
 
+from pymare import Dataset
+from pymare.estimators import DerSimonianLaird
+
+sns.set_theme(style="ticks")
+
 ###############################################################################
 # Wrangle some example data
 # -----------------------------------------------------------------------------
+# For this example, we will use the "American National Election Survey 1996"
+# dataset from statsmodels.
+# This dataset includes several elements we care about, including:
+#
+# 1. A large number of observations (944).
+# 2. A variable that can be used as a substitute for "study".
+#    In this case, we will pretend that all observations with similar
+#    populations come from the same cities, which we will in turn pretend
+#    reflect different studies of the same phenomenon by different research
+#    groups.
 data = sm.datasets.anes96.load_pandas().data
-data.head()
+data.head(10)
+
+###############################################################################
+# Convert the populations to "studies" by binning similar populations and
+# assigning them to the same "study" values.
+
+# I selected the first study with a first author starting with each letter from Neurosynth.
+# I figured it would give the dataset some verisimilitude.
+study_names = [
+    "Aarts & Roelofs (2011)",
+    "Baas et al. (2008)",
+    "Cabanis et al. (2013)",
+    "D'Agata et al. (2011)",
+    "Eack et al. (2008)",
+    "Fabbri, Caramazza, & Lingnau (2012)",
+    "Gaab, Gabrieli, & Glover (2007)",
+    "Haag et al. (2014)",
+]
+
+data = data.sort_values(by="popul")
+group_assignments = []
+split_assignments = np.array_split(np.arange(data.shape[0], dtype=int), 8)
+for i_split, split in enumerate(split_assignments):
+    group_assignments += [study_names[i_split]] * len(split)
+
+data["study"] = group_assignments
+data.head(10)
+
+###############################################################################
+fig, ax = plt.subplots(figsize=(16, 8))
+sns.stripplot(data=data, x="logpopul", y="study", ax=ax)
+fig.tight_layout()
+fig.show()
 
 ###############################################################################
-dat = data["popul"]
-bins = np.linspace(0, np.max(dat) + 1, 20)
-digitized = np.digitize(dat, bins)
-idx = {i: np.where(digitized == i)[0] for i in range(1, len(bins))}
-idx = {k: v for k, v in idx.items() if v.size}
-
-# Assign "site" based on grouped populations
-data["site"] = 0
-i = 0
-letters = ["a", "b", "c", "d", "e", "f", "g", "h"]
-for k, v in idx.items():
-    data.loc[v, "site"] = letters[i]
-    i += 1
-
-data.head()
+# The variables of interest
+# -----------------------------------------------------------------------------
+# First, let's talk about the variables of interest in this analysis.
+# The variables are:
+#
+# 1. TVnews: The number of times, per week, that the respondent watches the news on TV.
+# 2. age: The age of the respondent, in years.
+# 3. educ: The education of the respondent, binned into the following groups:
+#    (1) Grade 1-8, (2) Some high school, (3) High school graduate, (4) Some college,
+#    (5) College graduate, (6) Master's degree, and (7) PhD.
+#    We will evaluate linear relationships with this coded variable, which isn't
+#    optimal, but this is just an example, so... ¯\\_(ツ)_/¯
+#
+# If we visualize the distributions of the different variables, we'll see that
+# they're not exactly normally distributed...
+sns.pairplot(data[["study", "TVnews", "age", "educ"]], hue="study")
 
 ###############################################################################
 # The mega-analysis
 # -----------------------------------------------------------------------------
-# Random intercepts model using site
+# Random intercepts model using study.
+# In this model, we test the question, "to what extent do age and education
+# predict TV news watching?"
+#
+# We assume that data from the same study may have different baseline news-watching levels
+# (i.e., random intercepts).
 #
 # Do age and education predict TV news watching?
-model = sm.MixedLM.from_formula("TVnews ~ age + educ", data, groups=data["site"])
+model = sm.MixedLM.from_formula("TVnews ~ age + educ", data, groups=data["study"])
 fitted_model = model.fit()
 print(fitted_model.summary())
 
 ###############################################################################
 # Create the meta-analysis dataset
 # -----------------------------------
-import pandas as pd
-
+# We assume that each study performed their own analyses testing the same hypothesis.
+# The hypothesis, in this case, is "to what extent do age and education predict TV news watching"?
+#
+# The model used will, for our example, be exactly the same across studies.
+# This model is just a GLM with age, education, and an intercept predicting the number of hours.
+# Individual studies would then report the parameter estimate and variance for each term in the
+# model.
 data["intercept"] = 1
 target_vars = ["educ", "age", "intercept"]
 
-# calculate mean and variance for each variance of interest
+# calculate coefficient and variance for each variable of interest
 meta_df = []
-for site_name, site_df in data.groupby("site"):
-    model = sm.OLS(site_df["TVnews"], site_df[target_vars])
+for study_name, study_df in data.groupby("study"):
+    model = sm.OLS(study_df["TVnews"], study_df[target_vars])
     fitted_model = model.fit()
 
     # extract parameter estimates and errors as Series
@@ -75,12 +138,12 @@
 
     # combine Series and convert to DataFrame
     coefficients = coefficients.append(sampling_variances)
-    coefficients["site"] = site_name
-    coefficients["sample_size"] = site_df.shape[0]
+    coefficients["study"] = study_name
+    coefficients["sample_size"] = study_df.shape[0]
     temp_df = pd.DataFrame(coefficients).T
     meta_df.append(temp_df)
 
-# Combine across sites and convert objects to floats
+# Combine across studies and convert objects to floats
 meta_df = pd.concat(meta_df).reset_index(drop=True)
 meta_df = meta_df.convert_dtypes()
 meta_df
@@ -89,9 +152,6 @@
 # The meta-analysis
 # --------------------------------------------
 # Are age and education significant predictors of TV news watching across the literature?
-from pymare import Dataset
-from pymare.estimators import DerSimonianLaird
-
 metamodel = DerSimonianLaird()
 dset = Dataset(
     y=meta_df[["age", "educ"]].values.astype(float),
@@ -101,8 +161,6 @@
 metamodel.fit_dataset(dset)
 
 ###############################################################################
-from pprint import pprint
-
 summary = metamodel.summary()
 results = summary.get_fe_stats()
 pprint(results)

From d2081c5509624e29f4577625067d37145b75c2fc Mon Sep 17 00:00:00 2001
From: Taylor Salo <tsalo006@fiu.edu>
Date: Sun, 19 Dec 2021 15:01:11 -0500
Subject: [PATCH 7/7] Remove tabulate.

---
 setup.cfg | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index d6e2790..65e0d91 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -60,7 +60,6 @@ doc =
     sphinx_gallery==0.10.1
     sphinx_rtd_theme
     statsmodels
-    tabulate
 tests =
     codecov
     coverage