Skip to content

Commit

Permalink
Reformatting
Browse files Browse the repository at this point in the history
  • Loading branch information
wflynny committed Jul 30, 2020
1 parent 07ea8cf commit 28e7f02
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 173 deletions.
83 changes: 42 additions & 41 deletions post_processing/gr50.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,55 +33,63 @@ def compute_gr50(data, t0_col, t1_col, groupby="Drug", control="DMSO"):
prepared["log2_case"] = np.log2(prepared.t1 / prepared.t0)
prepared["log2_ctrl"] = np.log2(prepared.control / prepared.t0)
prepared["log2_ratio"] = prepared.log2_case / prepared.log2_ctrl
prepared["gr_value"] = 2**prepared.log2_ratio - 1
prepared["gr_value"] = 2 ** prepared.log2_ratio - 1
return prepared.sort_values(["concentration", groupby]).reset_index()


def plot_grs(intermediate, final, groupby="Drug", point_col="gr_value", line_col="GR50"):
def plot_grs(
intermediate, final, groupby="Drug", point_col="gr_value", line_col="GR50"
):
fig, ax = plt.subplots(dpi=300)

xmin = intermediate["concentration"].min() * 0.7
xmax = intermediate["concentration"].max() * 2
xs = np.logspace(np.log10(xmin), np.log10(xmax), 250)

params = {
"GR": ["GRinf", "GEC50", "h_GR"],
"IC": ["Einf", "EC50", "h"],
}[line_col[:2].upper()]
params = {"GR": ["GRinf", "GEC50", "h_GR"], "IC": ["Einf", "EC50", "h"],}[
line_col[:2].upper()
]

final = final.sort_values(line_col)
scatter_params = dict(s=20, lw=0.5, alpha=0.5, marker='o', edgecolor="none")
scatter_params = dict(s=20, lw=0.5, alpha=0.5, marker="o", edgecolor="none")
palette = sns.mpl_palette("tab20", len(final))
jitter = np.linspace(0.8, 1.2, len(final))
for (_, row), color, jit in zip(final.iterrows(), palette, jitter):
grouper = row[groupby]
inter = intermediate.loc[intermediate[groupby].isin([grouper]), ["concentration", point_col]]

ax.scatter(inter["concentration"]*jit, inter[point_col], color=color, **scatter_params)
inter = intermediate.loc[
intermediate[groupby].isin([grouper]), ["concentration", point_col]
]

ax.scatter(
inter["concentration"] * jit,
inter[point_col],
color=color,
**scatter_params,
)

#inter["rep"] = inter.groupby("concentration").cumcount()
#err_data = inter.pivot(index="concentration", columns="rep")
#ym = err_data.mean(axis=1)
#err = np.vstack((err_data.std(axis=1),)*2)
#ax.errorbar(err_data.index * jit, ym, yerr=err, marker="", linestyle="", capsize=2, lw=0.5, color=color)
# inter["rep"] = inter.groupby("concentration").cumcount()
# err_data = inter.pivot(index="concentration", columns="rep")
# ym = err_data.mean(axis=1)
# err = np.vstack((err_data.std(axis=1),)*2)
# ax.errorbar(err_data.index * jit, ym, yerr=err, marker="", linestyle="", capsize=2, lw=0.5, color=color)

logistic_params = row[params]
logistic_params[1] = np.nan_to_num(np.log10(logistic_params[1]))
ys = logistic(xs, logistic_params)
ax.plot(xs, ys, label=grouper, color=color, lw=2)
x50 = 10**logistic_params[1]
x50 = 10 ** logistic_params[1]
z50 = logistic(x50, logistic_params)
ax.scatter([x50], [z50], color=color, marker="o", edgecolor='k', zorder=10)
ax.scatter([x50], [z50], color=color, marker="o", edgecolor="k", zorder=10)

ax.set_title(line_col)
ax.set_xlabel("Concentration")
ax.set_ylabel(point_col)
ax.set_xscale("log")
ax.set_xlim(xmin, xmax)
leg = ax.legend(bbox_to_anchor=(1,0.5), frameon=False, loc="center left")
leg = ax.legend(bbox_to_anchor=(1, 0.5), frameon=False, loc="center left")
for t in leg.texts:
text = t.get_text().replace("+", " + ")
t.set_text('\n'.join(textwrap.wrap(text, 15)))
t.set_text("\n".join(textwrap.wrap(text, 15)))
sns.despine(fig, ax)

fig.tight_layout()
Expand Down Expand Up @@ -127,20 +135,22 @@ def parse_args():
)

parser.add_argument(
"-g0", "--gr50-t0",
"-g0",
"--gr50-t0",
dest="gr50_timepoint_0",
required=True,
help=(
"Specify measurement column to use as t_0 in GR50 calculation. "
"This can be the full column name or a regex; e.g. 'phenix-day1.*Sum.*'"
)
),
)

parser.add_argument(
"-g1", "--gr50-t1",
"-g1",
"--gr50-t1",
dest="gr50_timepoint_1",
required=True,
help="Same specification for '-g1'. Must specify both g0 and g1."
help="Same specification for '-g1'. Must specify both g0 and g1.",
)

parser.add_argument(
Expand All @@ -152,14 +162,11 @@ def parse_args():
)

parser.add_argument(
"-c",
dest="control_value",
default="DMSO",
"-c", dest="control_value", default="DMSO",
)

parser.add_argument(
"-v", "--verbose", action="store_true",
help="Print extra information"
"-v", "--verbose", action="store_true", help="Print extra information"
)

return parser.parse_args()
Expand All @@ -180,36 +187,30 @@ def parse_args():

starting = pd.read_csv(args.hcs_file, index_col=0)
t0_col, t1_col = find_columns(
starting.columns,
args.gr50_timepoint_0,
args.gr50_timepoint_1,
starting.columns, args.gr50_timepoint_0, args.gr50_timepoint_1,
)

intermediate = compute_gr50(
starting, t0_col, t1_col,
groupby="Drug",
control=args.control_value
starting, t0_col, t1_col, groupby="Drug", control=args.control_value
)

finalized = gr_metrics(
intermediate,
alpha=0.05,
gr_value="gr_value",
ic_value="relative_count",
keys=["Drug"]
keys=["Drug"],
)

gr50_fig = plot_grs(
intermediate,
finalized, groupby="Drug",
point_col="gr_value",
line_col="GR50"
intermediate, finalized, groupby="Drug", point_col="gr_value", line_col="GR50"
)
ic50_fig = plot_grs(
intermediate,
finalized, groupby="Drug",
finalized,
groupby="Drug",
point_col="relative_count",
line_col="IC50"
line_col="IC50",
)

intermediate_out = f"{args.output_prefix}_gr50-per-well.csv"
Expand Down
78 changes: 57 additions & 21 deletions post_processing/gr50_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,57 +8,67 @@
import numpy as np
import scipy.optimize, scipy.stats


def logistic(x, params):
    """Evaluate a three-parameter log-logistic dose-response curve at x.

    Parameters
    ----------
    x : float or array-like
        Concentration(s) at which to evaluate the curve.
    params : sequence of 3 floats
        (einf, log10_mid, slope): lower asymptote, log10 of the midpoint
        concentration, and Hill slope.

    Returns
    -------
    float or array-like
        Response value(s); the upper asymptote is fixed at 1.0
        (untreated/control response).
    """
    einf, log10_mid, slope = params
    emin = 1.0  # response at zero concentration is fixed to 1
    mid = 10 ** log10_mid  # midpoint is fit in log10 space for stability
    return ((emin - einf) / (1 + ((x / mid) ** slope))) + einf


def _logistic_inv(y, params):
einf, log10_mid, slope = params
emin = 1.0
mid = 10 ** log10_mid
if y >= min(emin, einf) and y <= max(emin, einf):
return mid * ( (y-emin) / (einf-y) ) ** (1/slope)
return mid * ((y - emin) / (einf - y)) ** (1 / slope)
else:
return np.inf


def _flat(x, params):
y, = params
(y,) = params
return y


def _rss(params, fn, xdata, ydata):
rss = 0.0
for x, y in zip(xdata, ydata):
rss += (y - fn(x, params)) ** 2
return rss


def _tss(ydata):
tss = 0.0
y_mean = ydata.mean()
for y in ydata:
tss += (y - y_mean) ** 2
return tss


def _rsquare(params, fn, xdata, ydata):
    """Coefficient of determination (R^2) of ``fn(x, params)`` on the data.

    R^2 = 1 - RSS/TSS; can be negative when the fit is worse than the mean.
    """
    unexplained_fraction = _rss(params, fn, xdata, ydata) / _tss(ydata)
    return 1 - unexplained_fraction


def _fit(fn, xdata, ydata, prior, bounds):
    """Bounded least-squares fit of model ``fn`` to (xdata, ydata).

    Minimizes ``_rss`` (residual sum of squares) starting from ``prior``,
    constraining each parameter to the corresponding row of ``bounds``.

    Returns
    -------
    scipy.optimize.OptimizeResult
        Full optimizer result; ``.x`` holds the fitted parameters and
        ``.fun`` the achieved RSS (used by ``_calculate_pval``).
    """
    return scipy.optimize.minimize(
        _rss, args=(fn, xdata, ydata), x0=prior, bounds=bounds
    )


def _calculate_pval(logistic_result, flat_result, n):
rss2 = logistic_result.fun
rss1 = flat_result.fun
df1 = len(logistic_result.x) - len(flat_result.x)
df2 = n - len(logistic_result.x) + 1
f = ( (rss1-rss2)/df1 ) / (rss2/df2)
f = ((rss1 - rss2) / df1) / (rss2 / df2)
pval = 1 - scipy.stats.f.cdf(f, df1, df2)
return pval


def _summarize(df, pval, alpha, logistic_result, flat_result, col):
if pval > alpha or not logistic_result.success:
if flat_result.x[0] > 0.5:
Expand All @@ -81,36 +91,62 @@ def _summarize(df, pval, alpha, logistic_result, flat_result, col):
aoc = np.trapz(1 - df[col], log_conc) / aoc_width
return [Z_50, max_, aoc, EZ_50, inf, slope, r2, pval]

def _metrics(df, gr_alpha=0.05, gr_value="GRvalue", ic_alpha=0.05, ic_value="t1"):
    """Fit GR and IC dose-response curves for one drug's data and summarize.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-well rows for a single group; must contain a ``concentration``
        column plus the ``gr_value`` and ``ic_value`` response columns.
    gr_alpha, ic_alpha : float
        Significance thresholds passed to ``_summarize`` for the F-test.
    gr_value, ic_value : str
        Column names of the GR-value and relative-count responses.

    Returns
    -------
    list
        Concatenated GR summary followed by IC summary (see ``_summarize``).
    """
    df = df.sort_values(by="concentration")
    # Extend the fitting range two decades beyond the observed concentrations
    # so the midpoint bound does not clip at the assay edges.
    conc_min = df.concentration.min() / 100
    conc_max = df.concentration.max() * 100

    # Bounds per parameter: [einf, log10(mid), slope]. GR values may go
    # negative (cytotoxicity), hence the wider einf bound.
    gr_bounds = np.array([[-1, 1], np.log10([conc_min, conc_max]), [0.1, 5]])
    ic_bounds = np.array([[0, 1], np.log10([conc_min, conc_max]), [0.1, 5]])
    prior = np.array([0.1, np.log10(np.median(df.concentration)), 2])

    # GR fit: full logistic vs flat null model, compared by F-test.
    gr_logistic_result = _fit(
        logistic, df.concentration, df[gr_value], prior, gr_bounds
    )
    gr_flat_result = _fit(
        _flat, df.concentration, df[gr_value], prior[[0]], gr_bounds[[0]]
    )
    gr_pval = _calculate_pval(gr_logistic_result, gr_flat_result, len(df.concentration))

    # IC fit: same procedure on the relative-count response.
    ic_logistic_result = _fit(
        logistic, df.concentration, df[ic_value], prior, ic_bounds
    )
    ic_flat_result = _fit(
        _flat, df.concentration, df[ic_value], prior[[0]], ic_bounds[[0]]
    )
    ic_pval = _calculate_pval(ic_logistic_result, ic_flat_result, len(df.concentration))

    gr_data = _summarize(
        df, gr_pval, gr_alpha, gr_logistic_result, gr_flat_result, gr_value
    )
    ic_data = _summarize(
        df, ic_pval, ic_alpha, ic_logistic_result, ic_flat_result, ic_value
    )

    return gr_data + ic_data

def gr_metrics(data, alpha=0.05, gr_value="GRvalue", ic_value="t1", keys=None):
    """Compute per-group GR50/IC50 metric tables.

    Parameters
    ----------
    data : pandas.DataFrame
        Per-well data with ``concentration``, the response columns, and
        the grouping column(s).
    alpha : float
        Significance threshold for the logistic-vs-flat F-test.
    gr_value, ic_value : str
        Column names of the GR-value and relative-count responses.
    keys : list of str, optional
        Grouping column(s); defaults to ``["Drug"]``.  (None sentinel
        avoids a mutable default argument.)

    Returns
    -------
    pandas.DataFrame
        One row per group: the key column(s) followed by the eight GR
        metrics then the eight IC metrics.
    """
    if keys is None:
        keys = ["Drug"]
    gb = data.groupby(keys)
    metric_columns = [
        "GR50",
        "GRmax",
        "GR_AOC",
        "GEC50",
        "GRinf",
        "h_GR",
        "r2_GR",
        "pval_GR",
        "IC50",
        "Emax",
        "AUC",
        "EC50",
        "Einf",
        "h",
        "r2_rel_cell",
        "pval_rel_cell",
    ]
    # NOTE: alpha is passed positionally as gr_alpha; ic_alpha keeps its
    # default (0.05) in `_metrics` — preserved from the original behavior.
    rows = [[k] + _metrics(v, alpha, gr_value, ic_value=ic_value) for k, v in gb]
    return pd.DataFrame(rows, columns=keys + metric_columns)
Loading

0 comments on commit 28e7f02

Please sign in to comment.