Skip to content

Commit

Permalink
Merge pull request #12 from LIVVkit/mkstratos/mvk-threshold
Browse files Browse the repository at this point in the history
Implement FDR correction for MVK tests
  • Loading branch information
mkstratos authored Dec 4, 2023
2 parents 15693ca + db8aaed commit b0d023d
Show file tree
Hide file tree
Showing 8 changed files with 488 additions and 216 deletions.
4 changes: 2 additions & 2 deletions evv4esm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# coding=utf-8
# Copyright (c) 2018-2022 UT-BATTELLE, LLC
# Copyright (c) 2018-2023 UT-BATTELLE, LLC
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -28,7 +28,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


__version_info__ = (0, 4, 0)
__version_info__ = (0, 5, 0)
__version__ = '.'.join(str(vi) for vi in __version_info__)

PASS_COLOR = '#389933'
Expand Down
3 changes: 1 addition & 2 deletions evv4esm/ensembles/e3sm.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ def component_monthly_files(dir_, component, ninst, hist_name="h0", nmonth_max=1
else:
date_search = "????-??"

def component_monthly_files(dir_, component, ninst, hist_name="hist", nmonth_max=24, date_style="short"):
base = "{d}/*{c}_????.{n}.????-??-??.nc".format(d=dir_, c=component, n=hist_name)
base = "{d}/*{c}_????.{n}.{ds}.nc".format(d=dir_, c=component, n=hist_name, ds=date_search)
search = os.path.normpath(base)
result = sorted(glob.glob(search))

Expand Down
221 changes: 166 additions & 55 deletions evv4esm/ensembles/tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright (c) 2018 UT-BATTELLE, LLC
# Copyright (c) 2018-2023 UT-BATTELLE, LLC
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -29,19 +29,23 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""General tools for working with ensembles."""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

from evv4esm import pf_color_picker, light_pf_color_picker

from evv4esm import pf_color_picker

def monthly_to_annual_avg(var_data, cal='ignore'):
def monthly_to_annual_avg(var_data, cal="ignore"):
if len(var_data) != 12:
raise ValueError('Error! There are 12 months in a year; '
'you passed in {} monthly averages.'.format(len(var_data)))
raise ValueError(
"Error! There are 12 months in a year; "
"you passed in {} monthly averages.".format(len(var_data))
)

if cal == 'ignore':
if cal == "ignore":
# weight each month equally
avg = np.average(var_data)
else:
Expand All @@ -50,81 +54,142 @@ def monthly_to_annual_avg(var_data, cal='ignore'):
return avg


def prob_plot(test, ref, n_q, img_file, test_name='Test', ref_name='Ref.',
thing='annual global averages', pf=None):
# NOTE: Following the methods described in
# https://stackoverflow.com/questions/43285752
# to create the Q-Q and P-P plots
def prob_plot(
test,
ref,
n_q,
img_file,
test_name="Test",
ref_name="Ref.",
thing="annual global averages",
pf=None,
combine_hist=False,
):
q = np.linspace(0, 100, n_q + 1)
all_ = np.concatenate((test, ref))
min_ = np.min(all_)
max_ = np.max(all_)

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(10, 10))
plt.rc('font', family='serif')
axes = [ax1, ax2, ax3, ax4]

ax1.set_title('Q-Q Plot')
ax1.set_xlabel('{} pdf'.format(ref_name))
ax1.set_ylabel('{} pdf'.format(test_name))
_var = os.path.split(img_file)[-1].split(".")[0]
fig.suptitle(_var)

# NOTE: Axis switched here from Q-Q plot because cdf reflects about the 1-1 line
ax2.set_title('P-P Plot')
ax2.set_xlabel('{} cdf'.format(test_name))
ax2.set_ylabel('{} cdf'.format(ref_name))
plt.rc("font", family="serif")

ax3.set_title('{} pdf'.format(ref_name))
ax3.set_xlabel('Unity-based normalization of {}'.format(thing))
ax3.set_ylabel('Frequency')
ax1.set_title("Q-Q Plot")
ax1.set_xlabel("{} pdf".format(ref_name))
ax1.set_ylabel("{} pdf".format(test_name))

ax4.set_title('{} pdf'.format(test_name))
ax4.set_xlabel('Unity-based normalization of {}'.format(thing))
ax4.set_ylabel('Frequency')
# NOTE: Axis switched here from Q-Q plot because cdf reflects about the 1-1 line
ax2.set_title("P-P Plot")
ax2.set_xlabel("{} cdf".format(test_name))
ax2.set_ylabel("{} cdf".format(ref_name))

norm_rng = [0.0, 1.0]
ax1.plot(norm_rng, norm_rng, 'gray', zorder=1)
ax1.plot(norm_rng, norm_rng, "gray", zorder=1)
ax1.set_xlim(tuple(norm_rng))
ax1.set_ylim(tuple(norm_rng))
ax1.autoscale()

ax2.plot(norm_rng, norm_rng, 'gray', zorder=1)
ax2.plot(norm_rng, norm_rng, "gray", zorder=1)
ax2.set_xlim(tuple(norm_rng))
ax2.set_ylim(tuple(norm_rng))
ax2.autoscale()

if combine_hist:
ax3.set_title("Ensemble histogram")
ax4.set_title("Ensemble CDF")
else:
ax3.set_title("{} pdf".format(ref_name))

ax4.set_title("{} pdf".format(test_name))
ax4.set_xlabel("Unity-based normalization of {}".format(thing))
ax4.set_ylabel("Frequency")
ax4.set_xlim(tuple(norm_rng))
ax4.autoscale()

ax3.set_ylabel("Frequency")
ax3.set_xlabel("Unity-based normalization of {}".format(thing))

ax3.set_xlim(tuple(norm_rng))
ax3.autoscale()

ax4.set_xlim(tuple(norm_rng))
ax4.autoscale()
ax4.set_ylabel("N Ensemble members")
ax4.set_xlabel("Unity-based normalization of {}".format(thing))

# NOTE: Produce unity-based normalization of data for the Q-Q plots because
# matplotlib can't handle small absolute values or data ranges. See
# https://github.com/matplotlib/matplotlib/issues/6015
if not np.allclose(min_, max_, rtol=np.finfo(max_).eps):
norm1 = (ref - min_) / (max_ - min_)
norm2 = (test - min_) / (max_ - min_)

ax1.scatter(np.percentile(norm1, q), np.percentile(norm2, q),
color=pf_color_picker.get(pf, '#1F77B4'), zorder=2)
ax3.hist(norm1, bins=n_q, color=pf_color_picker.get(pf, '#1F77B4'), edgecolor="k")
ax4.hist(norm2, bins=n_q, color=pf_color_picker.get(pf, '#1F77B4'), edgecolor="k")

# Check if these distributions are wildly different. If they are, use different
# colours for the bottom axis? Otherwise set the scales to be the same [0, 1]
if abs(norm1.mean() - norm2.mean()) >= 0.5:
ax3.tick_params(axis="x", colors="C0")
ax3.spines["bottom"].set_color("C0")

ax4.tick_params(axis="x", colors="C1")
ax4.spines["bottom"].set_color("C1")
else:
ax3.set_xlim(tuple(norm_rng))
bnds = np.linspace(min_, max_, n_q)
if not np.allclose(
bnds, bnds[0], rtol=np.finfo(bnds[0]).eps, atol=np.finfo(bnds[0]).eps
):
norm_ref = (ref - min_) / (max_ - min_)
norm_test = (test - min_) / (max_ - min_)

# Create P-P plot
ax1.scatter(
np.percentile(norm_ref, q),
np.percentile(norm_test, q),
color=pf_color_picker.get(pf, "#1F77B4"),
zorder=2,
)
if combine_hist:
# Plot joint histogram (groups test / ref side-by-side for each bin)
freq, bins, _ = ax3.hist(
[norm_ref, norm_test],
bins=n_q,
edgecolor="k",
label=[ref_name, test_name],
color=[
pf_color_picker.get(pf, "#1F77B4"),
light_pf_color_picker.get(pf, "#B55D1F"),
],
zorder=5,
)
ax3.legend()

cdf = freq.cumsum(axis=1)

ax4.plot(
bins,
[0, *cdf[0]],
color=pf_color_picker.get(pf, "#1F77B4"),
label=ref_name,
)
ax4.plot(
bins,
[0, *cdf[1]],
color=light_pf_color_picker.get(pf, "#B55D1F"),
label=test_name,
)
ax4.set_xlim(tuple(norm_rng))
ax4.legend()

else:
ax3.hist(
norm_ref, bins=n_q, color=pf_color_picker.get(pf, "#1F77B4"), edgecolor="k"
)
ax4.hist(
norm_test, bins=n_q, color=pf_color_picker.get(pf, "#1F77B4"), edgecolor="k"
)

# bin both series into equal bins and get cumulative counts for each bin
bnds = np.linspace(min_, max_, n_q)
if not np.allclose(bnds, bnds[0], rtol=np.finfo(bnds[0]).eps):
# Check if these distributions are wildly different. If they are, use different
# colours for the bottom axis? Otherwise set the scales to be the same [0, 1]
if abs(norm_ref.mean() - norm_test.mean()) >= 0.5:
ax3.tick_params(axis="x", colors="C0")
ax3.spines["bottom"].set_color("C0")

ax4.tick_params(axis="x", colors="C1")
ax4.spines["bottom"].set_color("C1")
else:
ax4.set_xlim(tuple(norm_rng))

ax3.set_xlim(tuple(norm_rng))

# bin both series into equal bins and get cumulative counts for each bin
ppxb = pd.cut(ref, bnds)
ppyb = pd.cut(test, bnds)

Expand All @@ -134,12 +199,58 @@ def prob_plot(test, ref, n_q, img_file, test_name='Test', ref_name='Ref.',
ppxh = np.cumsum(ppxh)
ppyh = np.cumsum(ppyh)

ax2.scatter(ppyh.values, ppxh.values,
color=pf_color_picker.get(pf, '#1F77B4'), zorder=2)
ax2.scatter(
ppyh.values, ppxh.values, color=pf_color_picker.get(pf, "#1F77B4"), zorder=2
)
else:
# Define a text box if the data are not plottable
const_axis_text = {
"x": 0.5,
"y": 0.5,
"s": f"CONSTANT FIELD\nMIN: {min_:.4e}\nMAX: {max_:.4e}",
"horizontalalignment": "center",
"verticalalignment": "center",
"backgroundcolor": ax1.get_facecolor(),
}
ax1.text(**const_axis_text)
ax2.text(**const_axis_text)
if combine_hist:
ax3.hist(
[test, ref],
bins=n_q,
edgecolor="k",
label=[test_name, ref_name],
color=[
pf_color_picker.get(pf, "#1F77B4"),
light_pf_color_picker.get(pf, "#B55D1F"),
],
zorder=5,
)
ax3.legend()
ax4.legend()
else:
ax3.hist(
test,
bins=n_q,
edgecolor="k",
color=pf_color_picker.get(pf, "#1F77B4"),
zorder=5,
)
ax4.hist(
ref,
bins=n_q,
edgecolor="k",
label=[test_name, ref_name],
color=pf_color_picker.get(pf, "#1F77B4"),
zorder=5,
)

for axis in axes:
axis.grid(visible=True, ls="--", lw=0.5, zorder=-1)

plt.tight_layout()
plt.savefig(img_file, bbox_inches='tight')

plt.savefig(img_file, bbox_inches="tight")

plt.close(fig)

Expand Down
Loading

0 comments on commit b0d023d

Please sign in to comment.