Skip to content

Commit

Permalink
Initial computations of GR50s/IC50s
Browse files Browse the repository at this point in the history
  • Loading branch information
wflynny committed Jul 27, 2020
1 parent 627030b commit 52f66b8
Show file tree
Hide file tree
Showing 4 changed files with 358 additions and 3 deletions.
225 changes: 225 additions & 0 deletions post_processing/gr50.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
import re
import os
import sys
import logging
import argparse
from pathlib import Path
import pandas as pd
import numpy as np

import utils
import textwrap
from gr50_metrics import gr_metrics, logistic

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns


def compute_gr50(data, t0_col, t1_col, groupby="Drug", control="DMSO"):
    """Derive per-well relative counts and GR values against a vehicle control.

    Pairs every treated well with the mean control signal at the same
    concentration, then computes the relative cell count (t1 / control)
    and the growth-rate-inhibition value
    2 ** (log2(t1/t0) / log2(control/t0)) - 1.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain "Concentration", `groupby`, `t0_col` and `t1_col`.
    t0_col, t1_col : str
        Measurement columns for the initial and final timepoints.
    groupby : str
        Column identifying the treatment (default "Drug").
    control : str
        Value of `groupby` marking vehicle-control wells (default "DMSO").

    Returns
    -------
    pd.DataFrame sorted by concentration then treatment, with columns:
    concentration, `groupby`, t0, t1, control, relative_count,
    log2_case, log2_ctrl, log2_ratio, gr_value.
    """
    wells = data[["Concentration", groupby, t0_col, t1_col]].copy()
    wells = wells.set_axis(["concentration", groupby, "t0", "t1"], axis=1)
    wells = wells.set_index("concentration").sort_index()

    # Mean control signal per concentration; the index join below pairs
    # every treated well with the control level at its concentration.
    ctrl_mean = (
        wells.loc[wells[groupby] == control, "t1"]
        .groupby("concentration")
        .mean()
        .rename("control")
    )

    cases = wells.join(ctrl_mean)
    cases = cases.loc[~cases[groupby].isin([control])].copy()

    cases["relative_count"] = cases["t1"] / cases["control"]
    cases["log2_case"] = np.log2(cases["t1"] / cases["t0"])
    cases["log2_ctrl"] = np.log2(cases["control"] / cases["t0"])
    cases["log2_ratio"] = cases["log2_case"] / cases["log2_ctrl"]
    cases["gr_value"] = 2 ** cases["log2_ratio"] - 1
    return cases.sort_values(["concentration", groupby]).reset_index()


def plot_grs(intermediate, final, groupby="Drug", point_col="gr_value", line_col="GR50"):
    """Plot per-concentration points and fitted dose-response curves.

    Draws one scatter series plus one logistic curve per row of `final`
    (one per treatment), with a highlighted marker at the concentration
    corresponding to the 50% crossing. Returns the matplotlib Figure.

    Parameters
    ----------
    intermediate : pd.DataFrame
        Per-well values with "concentration", `groupby` and `point_col`.
    final : pd.DataFrame
        Fitted metrics from gr_metrics(); must contain `line_col` and the
        parameter triplet selected by its "GR"/"IC" prefix.
    groupby : str
        Treatment column shared by both frames.
    point_col : str
        Column of `intermediate` plotted as scatter points.
    line_col : str
        Metric used for sorting/labeling; its first two letters select
        which fitted parameter columns are drawn.
    """
    fig, ax = plt.subplots(dpi=300)

    # Pad the x-range slightly beyond the measured concentrations.
    xmin = intermediate["concentration"].min() * 0.7
    xmax = intermediate["concentration"].max() * 2
    xs = np.logspace(np.log10(xmin), np.log10(xmax), 250)

    # Parameter triplet (asymptote, midpoint, slope) per metric family.
    params = {
        "GR": ["GRinf", "GEC50", "h_GR"],
        "IC": ["Einf", "EC50", "h"],
    }[line_col[:2].upper()]

    final = final.sort_values(line_col)
    scatter_params = dict(s=20, lw=0.5, alpha=0.5, marker='o', edgecolor="none")
    palette = sns.mpl_palette("tab20", len(final))
    # Horizontal jitter so overlapping series remain distinguishable.
    jitter = np.linspace(0.8, 1.2, len(final))
    for (_, row), color, jit in zip(final.iterrows(), palette, jitter):
        grouper = row[groupby]
        inter = intermediate.loc[intermediate[groupby].isin([grouper]), ["concentration", point_col]]

        ax.scatter(inter["concentration"]*jit, inter[point_col], color=color, **scatter_params)

        # BUG FIX: extract the parameters as a plain float array. The
        # previous `row[params][1] = ...` did positional assignment on a
        # string-indexed pandas Series, which modern pandas treats as
        # label-based — it would add a new entry labeled 1 instead of
        # replacing the midpoint, corrupting the curve.
        logistic_params = row[params].to_numpy(dtype=float)
        # logistic() expects log10(midpoint); nan_to_num guards midpoint == 0.
        logistic_params[1] = np.nan_to_num(np.log10(logistic_params[1]))
        ys = logistic(xs, logistic_params)
        ax.plot(xs, ys, label=grouper, color=color, lw=2)
        x50 = 10**logistic_params[1]
        z50 = logistic(x50, logistic_params)
        ax.scatter([x50], [z50], color=color, marker="o", edgecolor='k', zorder=10)

    ax.set_title(line_col)
    ax.set_xlabel("Concentration")
    ax.set_ylabel(point_col)
    ax.set_xscale("log")
    ax.set_xlim(xmin, xmax)
    leg = ax.legend(bbox_to_anchor=(1,0.5), frameon=False, loc="center left")
    # Wrap long legend labels so they fit beside the axes.
    for t in leg.texts:
        text = t.get_text().replace("+", " + ")
        t.set_text('\n'.join(textwrap.wrap(text, 15)))
    sns.despine(fig, ax)

    fig.tight_layout()

    return fig


def find_columns(cols, *cols_to_find):
    """Resolve each requested column spec to an actual column name.

    A spec that matches a column exactly is used as-is; otherwise it is
    treated as a regex and the first column it matches (via re.search)
    is used. If a spec matches nothing, a warning is logged and the
    process exits with status 2.
    """
    resolved = []
    for ctf in cols_to_find:
        if ctf in cols:
            resolved.append(ctf)
            continue

        pattern = re.compile(ctf)
        hit = next((col for col in cols if pattern.search(col)), None)
        if hit is None:
            # No exact name and no regex hit: bail out loudly.
            logger.warning(f"Cannot match [{ctf}] to any of [{cols}]")
            sys.exit(2)
        resolved.append(hit)

    return resolved


def parse_args():
    """Parse command-line arguments for the GR50/IC50 computation script.

    Returns
    -------
    argparse.Namespace
        With attributes hcs_file (Path), gr50_timepoint_0 (str),
        gr50_timepoint_1 (str), output_prefix (Path), control_value (str)
        and verbose (bool).
    """
    parser = argparse.ArgumentParser(
        description=(
            "Compute GR50 and IC50s for use with the Single Cell Biology "
            "Lab Opera Phenix High Content Screening platform"
        ),
        prog=__file__,
    )

    parser.add_argument(
        "-i",
        dest="hcs_file",
        type=Path,
        required=True,
        # Typo fix: was "Ouput".
        help="Output CSV of the HCS processing pipeline",
    )

    parser.add_argument(
        "-g0", "--gr50-t0",
        dest="gr50_timepoint_0",
        required=True,
        help=(
            "Specify measurement column to use as t_0 in GR50 calculation. "
            "This can be the full column name or a regex; e.g. 'phenix-day1.*Sum.*'"
        )
    )

    parser.add_argument(
        "-g1", "--gr50-t1",
        dest="gr50_timepoint_1",
        required=True,
        # Help fix: previously referred to '-g1' itself instead of '-g0'.
        help="Same specification as '-g0', but for t_1. Must specify both g0 and g1."
    )

    parser.add_argument(
        "-p",
        dest="output_prefix",
        type=Path,
        required=True,
        help="Output prefix to save output files with",
    )

    parser.add_argument(
        "-c",
        dest="control_value",
        default="DMSO",
        help="Value in the treatment column marking control wells (default: DMSO)",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true",
        help="Print extra information"
    )

    return parser.parse_args()


if __name__ == "__main__":
    # Log to stdout; default INFO, raised to DEBUG by -v below.
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="%(asctime)s - %(name)s: %(levelname)s: %(message)s",
    )
    logger = logging.getLogger(__file__)

    args = parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    logger.debug(f"Parsed arguments: {vars(args)}")

    # First column of the HCS pipeline CSV is taken as the row index.
    starting = pd.read_csv(args.hcs_file, index_col=0)
    # Resolve the user-supplied column names/regexes to actual columns.
    t0_col, t1_col = find_columns(
        starting.columns,
        args.gr50_timepoint_0,
        args.gr50_timepoint_1,
    )

    # Per-well relative counts and GR values against the control wells.
    intermediate = compute_gr50(
        starting, t0_col, t1_col,
        groupby="Drug",
        control=args.control_value
    )

    # Logistic curve fits and summary metrics, one row per drug.
    finalized = gr_metrics(
        intermediate,
        alpha=0.05,
        gr_value="gr_value",
        ic_value="t1",
        keys=["Drug"]
    )

    # One figure for GR curves, one for relative-count (IC) curves.
    gr50_fig = plot_grs(
        intermediate,
        finalized, groupby="Drug",
        point_col="gr_value",
        line_col="GR50"
    )
    ic50_fig = plot_grs(
        intermediate,
        finalized, groupby="Drug",
        point_col="relative_count",
        line_col="IC50"
    )

    # All outputs share the user-supplied prefix.
    intermediate_out = f"{args.output_prefix}_gr50-per-well.csv"
    final_out = f"{args.output_prefix}_gr50.csv"
    plot_out = f"{args.output_prefix}_gr50-plots.pdf"
    intermediate.to_csv(intermediate_out)
    finalized.to_csv(final_out)

    # Both figures go into a single multi-page PDF.
    pdf = PdfPages(plot_out)
    pdf.savefig(gr50_fig)
    pdf.savefig(ic50_fig)
    pdf.close()
    logger.debug(f"Plots saved to [{plot_out}].")
116 changes: 116 additions & 0 deletions post_processing/gr50_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""
This file is a slightly modified version of the `gr_metrics` code available here:
https://github.com/datarail/gr_metrics/blob/master/SRC/python/gr50/__init__.py
from commit e583df61a888cd3e8ba74bdfcf8ea13f817e67a3
"""

import pandas as pd
import numpy as np
import scipy.optimize, scipy.stats

def logistic(x, params):
    """Three-parameter logistic dose-response curve evaluated at x.

    params = (einf, log10_mid, slope): the asymptote approached at high
    concentration, the log10 of the midpoint concentration, and the Hill
    slope. The zero-concentration response is fixed at 1. Works for
    scalar or numpy-array x.
    """
    e_inf, log10_mid, hill = params
    e_zero = 1.0  # fixed response at zero concentration
    midpoint = 10 ** log10_mid
    return e_inf + (e_zero - e_inf) / (1 + (x / midpoint) ** hill)

def _logistic_inv(y, params):
einf, log10_mid, slope = params
emin = 1.0
mid = 10 ** log10_mid
if y >= min(emin, einf) and y <= max(emin, einf):
return mid * ( (y-emin) / (einf-y) ) ** (1/slope)
else:
return np.inf

def _flat(x, params):
y, = params
return y

def _rss(params, fn, xdata, ydata):
rss = 0.0
for x, y in zip(xdata, ydata):
rss += (y - fn(x, params)) ** 2
return rss

def _tss(ydata):
tss = 0.0
y_mean = ydata.mean()
for y in ydata:
tss += (y - y_mean) ** 2
return tss

def _rsquare(params, fn, xdata, ydata):
    """Coefficient of determination (R^2) of fn(x, params) against ydata."""
    return 1 - _rss(params, fn, xdata, ydata) / _tss(ydata)

def _fit(fn, xdata, ydata, prior, bounds):
    """Bounded least-squares fit of fn to (xdata, ydata).

    Minimizes _rss over the model parameters, starting from `prior`,
    and returns the scipy OptimizeResult.
    """
    return scipy.optimize.minimize(
        _rss, x0=prior, args=(fn, xdata, ydata), bounds=bounds
    )

def _calculate_pval(logistic_result, flat_result, n):
rss2 = logistic_result.fun
rss1 = flat_result.fun
df1 = len(logistic_result.x) - len(flat_result.x)
df2 = n - len(logistic_result.x) + 1
f = ( (rss1-rss2)/df1 ) / (rss2/df2)
pval = 1 - scipy.stats.f.cdf(f, df1, df2)
return pval

def _summarize(df, pval, alpha, logistic_result, flat_result, col):
    """Build the per-metric summary row from the two competing fits.

    Returns [Z_50, max_, aoc, EZ_50, inf, slope, r2, pval] where Z_50 is
    the concentration at which the selected curve crosses 0.5, EZ_50 the
    fitted midpoint concentration, inf the fitted asymptote, slope the
    Hill slope, and aoc the area over the curve normalized by the tested
    log10-concentration span.
    """
    # If the logistic fit is not significantly better than a flat line
    # (or failed to converge), fall back to the flat model: a flat curve
    # never crosses 0.5, so Z_50 is +/- infinity depending on which side
    # of 0.5 the flat level sits.
    if pval > alpha or not logistic_result.success:
        if flat_result.x[0] > 0.5:
            Z_50 = np.inf
        else:
            Z_50 = -np.inf
        inf = flat_result.x[0]
        EZ_50 = 0.0
        slope = 0.01  # near-flat placeholder slope for downstream plotting
        r2 = _rsquare(flat_result.x, _flat, df.concentration, df[col])
    else:
        Z_50 = _logistic_inv(0.5, logistic_result.x)
        inf = logistic_result.x[0]
        EZ_50 = 10 ** logistic_result.x[1]  # x[1] is log10(midpoint)
        slope = logistic_result.x[2]
        r2 = _rsquare(logistic_result.x, logistic, df.concentration, df[col])
    # Response at the top of the dose range: smaller of the last two values.
    # NOTE(review): df[col][-2:] slices positionally and assumes df is
    # sorted by concentration (done in _metrics) — confirm if reused elsewhere.
    max_ = min(df[col][-2:])
    log_conc = np.log10(df.concentration)
    aoc_width = log_conc.max() - log_conc.min()
    aoc = np.trapz(1 - df[col], log_conc) / aoc_width
    return [Z_50, max_, aoc, EZ_50, inf, slope, r2, pval]

def _metrics(df, gr_alpha=0.05, gr_value='GRvalue', ic_alpha=0.05, ic_value="t1"):
    """Fit GR and IC dose-response curves for one treatment group.

    For each of the two response columns (`gr_value`, `ic_value`) this
    fits a three-parameter logistic and a flat (constant) model over
    concentration, F-tests the logistic against the flat fit, and
    summarizes via _summarize. Returns the concatenated GR + IC summary
    rows as a single list of 16 values.
    """
    df = df.sort_values(by='concentration')
    # Allow the fitted midpoint to lie up to two decades outside the
    # tested concentration range.
    conc_min = df.concentration.min() / 100
    conc_max = df.concentration.max() * 100

    # Bounds are (asymptote, log10 midpoint, Hill slope). GR values may
    # be negative (cytotoxic response); relative counts may not.
    gr_bounds = np.array([[-1, 1], np.log10([conc_min, conc_max]), [0.1, 5]])
    ic_bounds = np.array([[0, 1], np.log10([conc_min, conc_max]), [0.1, 5]])
    prior = np.array([0.1, np.log10(np.median(df.concentration)), 2])

    # Full (logistic) vs null (flat) fit for the GR response; the flat
    # model uses only the first prior/bound (the asymptote level).
    gr_logistic_result = _fit(logistic, df.concentration, df[gr_value],
                              prior, gr_bounds)
    gr_flat_result = _fit(_flat, df.concentration, df[gr_value],
                          prior[[0]], gr_bounds[[0]])
    gr_pval = _calculate_pval(gr_logistic_result, gr_flat_result, len(df.concentration))

    # Same pair of fits for the relative-count (IC) response.
    ic_logistic_result = _fit(logistic, df.concentration, df[ic_value],
                              prior, ic_bounds)
    ic_flat_result = _fit(_flat, df.concentration, df[ic_value],
                          prior[[0]], ic_bounds[[0]])
    ic_pval = _calculate_pval(ic_logistic_result, ic_flat_result, len(df.concentration))

    gr_data = _summarize(df, gr_pval, gr_alpha, gr_logistic_result, gr_flat_result, gr_value)
    ic_data = _summarize(df, ic_pval, ic_alpha, ic_logistic_result, ic_flat_result, ic_value)

    return gr_data + ic_data

def gr_metrics(data, alpha=0.05, gr_value='GRvalue', ic_value="t1", keys=["Drug"]):
    """Compute GR50/IC50 summary metrics, one row per treatment group.

    Parameters
    ----------
    data : pd.DataFrame
        Long-format table with a "concentration" column plus the
        `gr_value` and `ic_value` response columns and the `keys` columns.
    alpha : float
        Significance threshold for the logistic-vs-flat F-test, applied
        to BOTH the GR and the IC fits.
    gr_value, ic_value : str
        Column names of the GR value and the relative-count response.
    keys : list of str
        Grouping columns identifying a treatment.

    Returns
    -------
    pd.DataFrame with the key columns followed by the GR and IC metric
    columns.
    """
    metric_columns = ['GR50', 'GRmax', 'GR_AOC', 'GEC50', 'GRinf', 'h_GR', 'r2_GR', 'pval_GR'] +\
        "IC50,Emax,AUC,EC50,Einf,h,r2_rel_cell,pval_rel_cell".split(",")
    rows = []
    for key, group in data.groupby(keys):
        # pandas >= 2.0 yields 1-tuples when grouping by a list of one
        # key; normalize so the key columns line up in either version.
        key_values = list(key) if isinstance(key, tuple) else [key]
        # BUG FIX: `alpha` was previously forwarded only as gr_alpha, so
        # the IC F-test silently always used the default 0.05 regardless
        # of the caller's alpha; pass it to both tests.
        rows.append(
            key_values
            + _metrics(group, gr_alpha=alpha, gr_value=gr_value,
                       ic_alpha=alpha, ic_value=ic_value)
        )
    return pd.DataFrame(rows, columns=keys + metric_columns)
Loading

0 comments on commit 52f66b8

Please sign in to comment.