allow experimental condition sample covariates unrelated to selection
jykr committed Nov 25, 2023
1 parent 526490f commit 84a03d0
Showing 8 changed files with 161 additions and 50 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -97,7 +97,7 @@ File should contain following columns with header.
* `R1_filepath`: Path to read 1 `.fastq[.gz]` file
* `R2_filepath`: Path to read 2 `.fastq[.gz]` file
* `sample_id`: ID of sequencing sample
* `rep [Optional]`: Replicate # of this sample
* `rep [Optional]`: Replicate # of this sample (Should NOT contain `.`)
* `bin [Optional]`: Name of the sorting bin
* `upper_quantile [Optional]`: FACS sorting upper quantile
* `lower_quantile [Optional]`: FACS sorting lower quantile
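
A minimal sketch of a sample sheet that satisfies the columns above (file paths and sample names are illustrative; the constraint that `rep` must not contain `.` matters because replicate and covariate values are joined with `.` into a composite key downstream):

```python
import pandas as pd

# Illustrative sample sheet; column names follow the list above.
sample_sheet = pd.DataFrame(
    {
        "R1_filepath": ["fastq/s1_R1.fastq.gz", "fastq/s2_R1.fastq.gz"],
        "R2_filepath": ["fastq/s1_R2.fastq.gz", "fastq/s2_R2.fastq.gz"],
        "sample_id": ["s1", "s2"],
        "rep": ["rep1", "rep2"],  # must NOT contain "."
        "bin": ["top", "bot"],
        "upper_quantile": [1.0, 0.3],
        "lower_quantile": [0.7, 0.0],
    }
)
sample_sheet.to_csv("sample_list.csv", index=False)
```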
18 changes: 15 additions & 3 deletions bean/framework/ReporterScreen.py
@@ -323,8 +323,20 @@ def __getitem__(self, index):
new_uns = deepcopy(self.uns)
for k, df in adata.uns.items():
if k.startswith("repguide_mask"):
new_uns[k] = df.loc[guides_include, adata.var.rep.unique()]
if "sample_covariates" in adata.uns:
adata.var["_rc"] = adata.var[
["rep"] + adata.uns["sample_covariates"]
].values.tolist()
adata.var["_rc"] = adata.var["_rc"].map(
lambda slist: ".".join(slist)
)
new_uns[k] = df.loc[guides_include, adata.var._rc.unique()]
adata.var.pop("_rc")
else:
new_uns[k] = df.loc[guides_include, adata.var.rep.unique()]
if not isinstance(df, pd.DataFrame):
if k == "sample_covariates":
new_uns[k] = df
continue
if "guide" in df.columns:
if "allele" in df.columns:
@@ -892,7 +904,7 @@ def concat(screens: Collection[ReporterScreen], *args, axis=1, **kwargs):

if axis == 0:
for k in keys:
if k in ["target_base_change", "tiling"]:
if k in ["target_base_change", "tiling", "sample_covariates"]:
adata.uns[k] = screens[0].uns[k]
continue
elif "edit" not in k and "allele" not in k:
@@ -902,7 +914,7 @@ def concat(screens: Collection[ReporterScreen], *args, axis=1, **kwargs):
if axis == 1:
# If combining multiple samples, edit/allele tables should be merged.
for k in keys:
if k in ["target_base_change", "tiling"]:
if k in ["target_base_change", "tiling", "sample_covariates"]:
adata.uns[k] = screens[0].uns[k]
continue
elif "edit" not in k and "allele" not in k:
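
The `__getitem__` change above subsets the `repguide_mask` tables by a composite replicate label whenever `sample_covariates` is stored in `.uns`. A standalone sketch of that key construction (the DataFrame and the `treatment` covariate are illustrative; the `rep` column and the `.`-join come from the diff):

```python
import pandas as pd

# Illustrative samples table; "treatment" stands in for a sample covariate.
samples = pd.DataFrame(
    {"rep": ["rep1", "rep1", "rep2"], "treatment": ["drug", "vehicle", "drug"]}
)
sample_covariates = ["treatment"]

# Join replicate and covariate values with "." to form the composite key
# that matches the columns of the repguide_mask tables.
rc = samples[["rep"] + sample_covariates].values.tolist()
rc = [".".join(v) for v in rc]
print(rc)  # ['rep1.drug', 'rep1.vehicle', 'rep2.drug']
```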
5 changes: 4 additions & 1 deletion bean/preprocessing/data_class.py
@@ -2,7 +2,7 @@
import abc
import logging
from dataclasses import dataclass
from typing import Dict, Tuple
from typing import Dict, Tuple, List
from xmlrpc.client import Boolean
from copy import deepcopy
import torch
@@ -40,6 +40,7 @@ def __init__(
sample_mask_column: str = None,
shrink_alpha: bool = False,
condition_column: str = "sort",
sample_covariate_column: List[str] = [],
control_condition: str = "bulk",
accessibility_col: str = None,
accessibility_bw_path: str = None,
@@ -51,6 +52,7 @@
):
"""
Args
condition_column: By default, a single condition column, but you can optionally include sample covariate columns.
control_can_be_selected: If True, screen.samples[condition_column] == control_condition can also be included in effect size inference if its condition column is not NA (currently only supported for proliferation screens).
"""
# TODO: remove replicate with too small number of (ex. only 1) sorting bin
@@ -821,6 +823,7 @@ def __init__(
sample_mask_column: str = None,
shrink_alpha: bool = False,
condition_column: str = "sort",
sample_covariate_column: List[str] = [],
control_condition: str = "bulk",
lower_quantile_column: str = "lower_quantile",
upper_quantile_column: str = "upper_quantile",
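
The new `sample_covariate_column` argument defaults to an empty list, so existing callers are unchanged. A hedged sketch of the intended call pattern (the class name `ScreenData` is hypothetical; the parameter names and defaults follow the diff):

```python
from typing import List, Optional

class ScreenData:  # hypothetical stand-in for the data class extended above
    def __init__(
        self,
        condition_column: str = "sort",
        sample_covariate_column: Optional[List[str]] = None,  # diff uses [] as the default
        control_condition: str = "bulk",
    ):
        # Covariates describe non-selective experimental conditions
        # (e.g. drug treatment) carried alongside the replicate label.
        self.condition_column = condition_column
        self.sample_covariate_column = sample_covariate_column or []
        self.control_condition = control_condition

data = ScreenData(condition_column="bin", sample_covariate_column=["treatment"])
```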
26 changes: 19 additions & 7 deletions bean/qc/guide_qc.py
@@ -22,15 +22,27 @@ def get_outlier_guides_and_mask(
abs_RPM_thres: RPM threshold value that will be used to define outlier guides.
"""
outlier_guides = get_outlier_guides(bdata, condit_col, mad_z_thres, abs_RPM_thres)
outlier_guides[replicate_col] = bdata.samples.loc[
outlier_guides["sample"], replicate_col
].values
mask = pd.DataFrame(
index=bdata.guides.index, columns=bdata.samples[replicate_col].unique()
).fillna(1)
if not isinstance(replicate_col, str):
outlier_guides["_rc"] = bdata.samples.loc[
outlier_guides["sample"], replicate_col
].values.tolist()
outlier_guides["_rc"] = outlier_guides["_rc"].map(lambda slist: ".".join(slist))
else:
outlier_guides[replicate_col] = bdata.samples.loc[
outlier_guides["sample"], replicate_col
].values
if isinstance(replicate_col, str):
reps = bdata.samples[replicate_col].unique()
else:
reps = bdata.samples[replicate_col].drop_duplicates().to_records(index=False)
reps = [".".join(slist) for slist in reps]
mask = pd.DataFrame(index=bdata.guides.index, columns=reps).fillna(1)
print(outlier_guides)
for _, row in outlier_guides.iterrows():
mask.loc[row["name"], row[replicate_col]] = 0
mask.loc[
row["name"], row[replicate_col if isinstance(replicate_col, str) else "_rc"]
] = 0

return outlier_guides, mask


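
When `replicate_col` is a list (replicate label plus covariates), the mask columns become the `.`-joined unique replicate/covariate combinations. A small self-contained sketch of that masking step (guide names, replicate keys, and the flagged outlier are made up for illustration):

```python
import pandas as pd

guides = ["g1", "g2", "g3"]
# Unique replicate/covariate combinations, already "."-joined as in the diff.
reps = ["rep1.drug", "rep1.vehicle", "rep2.drug"]

# Start with an all-ones mask: 1 = keep this guide in this replicate group.
mask = pd.DataFrame(index=guides, columns=reps).fillna(1)

# Outlier guides flagged per sample, annotated with their composite "_rc" key.
outlier_guides = pd.DataFrame({"name": ["g2"], "_rc": ["rep1.drug"]})

# Zero out each outlier guide in the replicate group it was flagged in.
for _, row in outlier_guides.iterrows():
    mask.loc[row["name"], row["_rc"]] = 0

print(mask)
```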
72 changes: 53 additions & 19 deletions bean/qc/utils.py
@@ -1,4 +1,5 @@
import distutils
from typing import Union, List
import numpy as np
import pandas as pd
from copy import deepcopy
@@ -52,12 +53,23 @@ def parse_args():
type=str,
default="rep",
)
parser.add_argument(
"--sample-covariates",
help="Comma-separated list of column names in `bdata.samples` that describes non-selective experimental condition. (drug treatment, etc.)",
type=str,
default=None,
)
parser.add_argument(
"--condition-label",
help="Label of column in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)",
type=str,
default="bin",
)
parser.add_argument(
"--no-editing",
help="Ignore QC about editing. Can be used for QC of other editing modalities.",
action="store_true",
)
parser.add_argument(
"--target-pos-col",
help="Target position column in `bdata.guides` specifying target edit position in reporter",
@@ -143,21 +155,30 @@ def check_args(args):
)
args.lfc_cond1 = lfc_conds[0]
args.lfc_cond2 = lfc_conds[1]
if args.sample_covariates is not None:
if "," in args.sample_covariates:
args.sample_covariates = args.sample_covariates.split(",")
args.replicate_label = [args.replicate_label] + args.sample_covariates
else:
args.replicate_label = [args.replicate_label, args.sample_covariates]
if args.no_editing:
args.base_edit_data = False
else:
args.base_edit_data = True
return args


def _add_dummy_sample(bdata, rep, cond, condition_label: str, replicate_label: str):
def _add_dummy_sample(
bdata, rep, cond, condition_label: str, replicate_label: Union[str, List[str]]
):
sample_id = f"{rep}_{cond}"
cond_df = deepcopy(bdata.samples)
cond_df[replicate_label] = np.nan
cond_df = cond_df.drop_duplicates()
# cond_df = cond_df.drop_duplicates()
cond_row = cond_df.loc[cond_df[condition_label] == cond, :]
if not len(cond_row) == 1:
raise ValueError(
f"Non-unique condition specification in ReporterScreen.samples: {cond_row}"
)
if len(cond_row) != 1:
cond_row = cond_row.iloc[[0], :]
cond_row.index = [sample_id]
cond_row.loc[:, replicate_label] = rep
cond_row[replicate_label] = rep
dummy_sample_bdata = ReporterScreen(
X=np.zeros((bdata.n_obs, 1)),
X_bcmatch=np.zeros((bdata.n_obs, 1)),
@@ -175,27 +196,40 @@ def _add_dummy_sample(bdata, rep, cond, condition_label: str, replicate_label: s
return bdata


def fill_in_missing_samples(bdata, condition_label: str, replicate_label: str):
def fill_in_missing_samples(
bdata, condition_label: str, replicate_label: Union[str, List[str]]
):
"""If not all condition exists for every replicate in bdata, fill in fake sample"""
added_dummy = False
for rep in bdata.samples[replicate_label].unique():
if isinstance(replicate_label, str):
rep_list = bdata.samples[replicate_label].unique()
else:
rep_list = (
bdata.samples[replicate_label].drop_duplicates().to_records(index=False)
)
# print(rep_list)
for rep in rep_list:
for cond in bdata.samples[condition_label].unique():
if isinstance(replicate_label, str):
rep_samples = bdata.samples[replicate_label] == rep
else:
rep = list(rep)
rep_samples = (bdata.samples[replicate_label] == rep).all(axis=1)
if (
len(
np.where(
(bdata.samples[replicate_label] == rep)
& (bdata.samples[condition_label] == cond)
)[0]
)
len(np.where(rep_samples & (bdata.samples[condition_label] == cond))[0])
!= 1
):
print(f"Adding dummy samples for {rep}, {cond}")
bdata = _add_dummy_sample(
bdata, rep, cond, condition_label, replicate_label
)
if not added_dummy:
added_dummy = True
if added_dummy:
bdata = bdata[
:, bdata.samples.sort_values([replicate_label, condition_label]).index
]
if isinstance(replicate_label, str):
sort_labels = [replicate_label, condition_label]
else:
sort_labels = replicate_label + [condition_label]
bdata = bdata[:, bdata.samples.sort_values(sort_labels).index]

return bdata
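
The `check_args` change above splits the comma-separated `--sample-covariates` string and prepends the replicate label, so downstream QC functions receive either a single column name or a list of columns. A standalone sketch of that normalization (the function name is hypothetical; the branching mirrors the diff):

```python
from typing import List, Optional, Union

def normalize_replicate_label(
    replicate_label: str, sample_covariates: Optional[str]
) -> Union[str, List[str]]:
    # No covariates: keep the plain replicate column name.
    if sample_covariates is None:
        return replicate_label
    # Comma-separated covariates become a list, prepended with the replicate label.
    if "," in sample_covariates:
        return [replicate_label] + sample_covariates.split(",")
    return [replicate_label, sample_covariates]

print(normalize_replicate_label("rep", "treatment,donor"))  # ['rep', 'treatment', 'donor']
print(normalize_replicate_label("rep", None))               # 'rep'
```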
1 change: 1 addition & 0 deletions bin/bean-qc
@@ -34,6 +34,7 @@ def main():
ctrl_cond=args.ctrl_cond,
exp_id=args.out_report_prefix,
recalculate_edits=args.recalculate_edits,
base_edit_data=args.base_edit_data,
),
kernel_name="bean_python3",
)
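
The `bean-qc` entry point now forwards the `base_edit_data` flag into the QC report notebook as a parameter. A hedged sketch of that pattern (assuming papermill-style notebook execution; the notebook paths and the other parameter values are illustrative):

```python
import papermill as pm

# Hypothetical invocation mirroring the pattern in bin/bean-qc: parameters are
# injected into the notebook's parameters cell before execution.
pm.execute_notebook(
    "notebooks/sample_quality_report.ipynb",
    "my_screen.sample_quality_report.ipynb",
    parameters=dict(
        ctrl_cond="bulk",
        exp_id="my_screen",
        recalculate_edits=False,
        base_edit_data=True,  # new parameter added in this commit
    ),
    kernel_name="bean_python3",
)
```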
83 changes: 66 additions & 17 deletions notebooks/sample_quality_report.ipynb
@@ -56,7 +56,8 @@
"comp_cond2 = \"bot\"\n",
"ctrl_cond = \"bulk\"\n",
"recalculate_edits = False\n",
"tiling = None"
"tiling = None\n",
"base_edit_data = True"
]
},
{
@@ -75,7 +76,9 @@
"outputs": [],
"source": [
"if tiling is not None:\n",
" bdata.uns['tiling'] = tiling"
" bdata.uns['tiling'] = tiling\n",
"if not isinstance(replicate_label, str):\n",
" bdata.uns['sample_covariates'] = replicate_label[1:]"
]
},
{
Expand Down Expand Up @@ -208,20 +211,31 @@
"outputs": [],
"source": [
"selected_guides = bdata.guides[posctrl_col] == posctrl_val if posctrl_col else ~bdata.guides.index.isnull()\n",
"ax=pt.qc.plot_lfc_correlation(bdata, selected_guides, method=\"Spearman\", cond1=comp_cond1, cond2=comp_cond2, rep_col=replicate_label, compare_col=condition_label, figsize=(10,10))\n",
"\n",
"ax.set_title(\"top/bot LFC correlation, Spearman\")\n",
"plt.yticks(rotation=0) \n",
"plt.xticks(rotation=90) \n",
"plt.show()"
"print(f\"Calculating LFC correlation of {sum(selected_guides)} {'positive control' if posctrl_col else 'all'} guides.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"ax = pt.qc.plot_lfc_correlation(\n",
" bdata,\n",
" selected_guides,\n",
" method=\"Spearman\",\n",
" cond1=comp_cond1,\n",
" cond2=comp_cond2,\n",
" rep_col=replicate_label,\n",
" compare_col=condition_label,\n",
" figsize=(10, 10),\n",
")\n",
"\n",
"ax.set_title(\"top/bot LFC correlation, Spearman\")\n",
"plt.yticks(rotation=0)\n",
"plt.xticks(rotation=90)\n",
"plt.show()"
]
},
{
"cell_type": "code",
@@ -243,7 +257,12 @@
"metadata": {},
"outputs": [],
"source": [
"if recalculate_edits or \"edits\" not in bdata.layers.keys() or bdata.layers['edits'].max() == 0:\n",
"if \"target_base_change\" not in bdata.uns or not base_edit_data:\n",
" bdata.uns[\"target_base_change\"] = \"\"\n",
" base_edit_data = False\n",
" print(\"Not a base editing data or target base change not provided. Passing editing-related QC\")\n",
" edit_rate_threshold = -0.1\n",
"elif recalculate_edits or \"edits\" not in bdata.layers.keys() or bdata.layers['edits'].max() == 0:\n",
" if 'allele_counts' in bdata.uns.keys():\n",
" bdata.uns['allele_counts'] = bdata.uns['allele_counts'].loc[bdata.uns['allele_counts'].allele.map(str) != \"\"]\n",
" bdata.get_edit_from_allele()\n",
Expand All @@ -261,12 +280,22 @@
"metadata": {},
"outputs": [],
"source": [
"if \"edits\" in bdata.layers.keys():\n",
"if \"target_base_change\" not in bdata.uns or not base_edit_data:\n",
" print(\n",
" \"Not a base editing data or target base change not provided. Passing editing-related QC\"\n",
" )\n",
"elif \"edits\" in bdata.layers.keys():\n",
"\n",
" bdata.get_guide_edit_rate(\n",
"\n",
" editable_base_start=edit_quantification_start_pos,\n",
"\n",
" editable_base_end=edit_quantification_end_pos,\n",
"\n",
" unsorted_condition_label=ctrl_cond,\n",
"\n",
" )\n",
"\n",
" be.qc.plot_guide_edit_rates(bdata)"
]
},
Expand All @@ -276,11 +305,19 @@
"metadata": {},
"outputs": [],
"source": [
"if \"edits\" in bdata.layers.keys():\n",
"if \"target_base_change\" not in bdata.uns or not base_edit_data:\n",
" print(\n",
" \"Not a base editing data or target base change not provided. Passing editing-related QC\"\n",
" )\n",
"elif \"edits\" in bdata.layers.keys():\n",
"\n",
" bdata.get_edit_rate(\n",
" editable_base_start = edit_quantification_start_pos, \n",
" editable_base_end=edit_quantification_end_pos\n",
" editable_base_start=edit_quantification_start_pos,\n",
"\n",
" editable_base_end=edit_quantification_end_pos,\n",
"\n",
" )\n",
"\n",
" be.qc.plot_sample_edit_rates(bdata)"
]
},
Expand Down Expand Up @@ -329,12 +366,24 @@
"outputs": [],
"source": [
"# leave replicate with more than 1 sorting bin data\n",
"rep_n_samples = bdata_filtered.samples.groupby(replicate_label)['mask'].sum()\n",
"rep_n_samples = bdata_filtered.samples.groupby(replicate_label)[\"mask\"].sum()\n",
"print(rep_n_samples)\n",
"rep_has_too_small_sample = rep_n_samples.loc[rep_n_samples < 2].index.tolist()\n",
"rep_has_too_small_sample\n",
"print(f\"Excluding reps {rep_has_too_small_sample} that has less than 2 samples per replicate.\")\n",
"bdata_filtered = bdata_filtered[:, ~bdata_filtered.samples[replicate_label].isin(rep_has_too_small_sample)]"
"print(\n",
" f\"Excluding reps {rep_has_too_small_sample} that has less than 2 samples per replicate.\"\n",
")\n",
"if isinstance(replicate_label, str):\n",
" samples_include = ~bdata_filtered.samples[replicate_label].isin(\n",
" rep_has_too_small_sample\n",
" )\n",
"else:\n",
" bdata_filtered.samples[\"_rc\"] = bdata_filtered.samples[\n",
" replicate_label\n",
" ].values.tolist()\n",
" samples_include = ~bdata_filtered.samples[\"_rc\"].isin(rep_has_too_small_sample)\n",
"bdata_filtered = bdata_filtered[:, samples_include]\n",
"bdata_filtered.samples.pop(\"_rc\")"
]
},
{