-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/master' into gh-pages
- Loading branch information
Showing
13 changed files
with
13,614 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Readme | ||
|
||
This library contains basic examples that show how to interact and make use of the JUMP dataset. |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
[tool.poetry] | ||
name = "pills" | ||
version = "0.0.0" | ||
description = "Examples that show how to interact with JUMP data" | ||
authors = ["Alan Munoz"] | ||
readme = "README.md" | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.9" | ||
boto3 = ">=1.33.1" | ||
matplotlib = "^3.8.2" | ||
polars = "^0.19.19" | ||
pyarrow = "^14.0.1" | ||
s3fs = "^2023.12.1" | ||
seaborn = "^0.13.2" | ||
|
||
[tool.poetry.group.dev.dependencies] | ||
jupyter = "^1.0.0" | ||
jupytext = "^1.15.0" | ||
pytest = "^7.4.1" | ||
ruff-lsp = "^0.0.48" | ||
ruff = "<0.2.0" | ||
|
||
[tool.jupytext.formats] | ||
"notebooks/" = ".qmd" | ||
"scripts/" = "py:percent" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/usr/bin/env jupyter | ||
# --- | ||
# jupyter: | ||
# jupytext: | ||
# text_representation: | ||
# extension: .py | ||
# format_name: percent | ||
# format_version: '1.3' | ||
# jupytext_version: 1.16.1 | ||
# kernelspec: | ||
# display_name: Python 3 | ||
# language: python | ||
# name: python3 | ||
# --- | ||
# %% Overview [markdown] | ||
# # Basic JUMP data access | ||
# This is a tutorial on how to access | ||
# We will use polars to fetch the data frame lazily, with the help of s3fs and pyarrow. | ||
# We prefer lazy loading because the data can be too big to be handled in memory. | ||
# %% Imports | ||
import polars as pl | ||
from pyarrow.dataset import dataset | ||
from s3fs import S3FileSystem | ||
|
||
# %% [markdown] | ||
# The shapes of the available datasets are: | ||
# - crispr: Knock-out genetic perturbations. | ||
# - orf: Overexpression genetic perturbations. | ||
# - compounds: Chemical genetic perturbations. | ||
# | ||
# The aws paths of the dataframes are shown below: | ||
# %% Paths | ||
prefix = ( | ||
"s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles" | ||
) | ||
filepaths = dict( | ||
crispr=f"{prefix}/chandrasekaran_2024_0000000/crispr/wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet", | ||
orf=f"{prefix}/chandrasekaran_2024_0000000/orf/wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet", | ||
compound=f"{prefix}/arevalo_2023_e834481/compound/mad_int_featselect_harmony.parquet/", | ||
) | ||
|
||
|
||
# %% [markdown] | ||
# We use a S3FileSystem to avoid the need of credentials. | ||
# %% | ||
def lazy_load(path: str) -> pl.DataFrame: | ||
fs = S3FileSystem(anon=True) | ||
myds = dataset(path, filesystem=fs) | ||
df = pl.scan_pyarrow_dataset(myds) | ||
return df | ||
|
||
|
||
# %% [markdown] | ||
# We will lazy load the data to visualise its columns | ||
# %% | ||
print("Width, or number of columns.") | ||
for name, path in filepaths.items(): | ||
data = lazy_load(path) | ||
metadata_cols = [col for col in data.columns if col.startswith("Metadata")] | ||
print(f"{name}: {data.width}, containing {len(metadata_cols)} metadata columns") | ||
|
||
# %% [markdown] | ||
# Let us now focus on the crispr dataset | ||
# %% | ||
data = lazy_load(filepaths["crispr"]) |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
[tool.poetry] | ||
name = "jump_stories" | ||
version = "0.1.0" | ||
description = "" | ||
authors = ["Alan"] | ||
readme = "README.md" | ||
|
||
[tool.poetry.dependencies] | ||
python = ">=3.9,<3.11" | ||
PyYAML = "5.3.1" | ||
# plotly = "4.14.3" | ||
# dvc-s3 = "^2.23.0" | ||
jupyter = "^1.0.0" | ||
scikit-learn = "^1.3.0" | ||
scipy = ">=1.8.1" | ||
matplotlib = "^3.7.3" | ||
seaborn = "^0.12.2" | ||
broad-babel = "^0.1.9" | ||
copairs = {git = "https://github.com/cytomining/copairs.git", rev = "v0.4.0-alpha"} | ||
numpy = "*" | ||
fastcluster = "^1.2.6" | ||
pandas = "*" | ||
pyarrow = "^14.0.1" | ||
joblib = "^1.3.2" | ||
adbc-driver-sqlite = "^0.8.0" | ||
|
||
|
||
[tool.poetry.group.dev.dependencies] | ||
jupytext = "^1.15.2" | ||
ipdb = "^0.13.13" | ||
pyflakes = "^3.1.0" | ||
ruff-lsp = "^0.0.48" | ||
|
||
[build-system] | ||
requires = ["poetry-core"] | ||
build-backend = "poetry.core.masonry.api" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#!/usr/bin/env jupyter | ||
# --- | ||
# jupyter: | ||
# jupytext: | ||
# text_representation: | ||
# extension: .py | ||
# format_name: percent | ||
# format_version: '1.3' | ||
# jupytext_version: 1.15.2 | ||
# kernelspec: | ||
# display_name: Python 3 | ||
# language: python | ||
# name: python3 | ||
# --- | ||
# %% [Markdown] | ||
""" | ||
Find correlations between CD44 and HAS2. Question inspired by Prof. Chonghui Chen from Baylor College of Medicine during ASCB2 2023. | ||
""" | ||
|
||
from itertools import product | ||
from pathlib import Path | ||
|
||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import pandas as pd | ||
import seaborn as sns | ||
from broad_babel.query import run_query | ||
from copairs.compute import pairwise_cosine | ||
|
||
from utils import get_figs_dir, load_path | ||
|
||
figs_dir = get_figs_dir() | ||
df = load_path("../../profiles/harmonized_no_sphering_profiles.parquet") | ||
|
||
genes_of_interest = ("CD44", "HAS2") | ||
cd44_crispr = run_query(query="CD44", input_column="standard_key", output_column="*") | ||
|
||
gene_matches = {pert: dict() for pert in genes_of_interest} | ||
for gene in genes_of_interest: | ||
results = list( | ||
map( | ||
lambda x: x, | ||
run_query(query=gene, input_column="standard_key", output_column="*"), | ||
) | ||
) | ||
for pert_type in ("crispr", "orf"): | ||
result_array = [x[1] for x in results if x[0] == pert_type] | ||
if len(result_array): | ||
gene_matches[gene][pert_type] = result_array[0] | ||
|
||
# %% Calculate the correlations between has2 and cd44 based on CRISPR data | ||
|
||
|
||
feat_cols = [x for x in df.columns if not x.startswith("Metadata")] | ||
translator_crispr_jump_gene = {v.get("crispr"): k for k, v in gene_matches.items()} | ||
subset = df.loc[df["Metadata_JCP2022"].isin(translator_crispr_jump_gene)] | ||
subset.set_index( | ||
subset["Metadata_JCP2022"].map(translator_crispr_jump_gene), | ||
inplace=True, | ||
) | ||
|
||
data_subset_no_features = subset[feat_cols] | ||
n_samples = len(subset) | ||
all_pair_ixs = np.array(list(product(range(n_samples), range(n_samples)))) | ||
cosine_distance = pairwise_cosine( | ||
data_subset_no_features.values, all_pair_ixs, batch_size=min(200, n_samples) | ||
).reshape((n_samples, n_samples)) | ||
cosine_df = ( | ||
pd.DataFrame( | ||
data=cosine_distance, | ||
index=subset.index, | ||
columns=subset.index, | ||
) | ||
.sort_index(axis=0) | ||
.sort_index(axis=1) | ||
) | ||
sns.heatmap( | ||
cosine_df, | ||
robust=True, | ||
) | ||
plt.tight_layout() | ||
plt.savefig( | ||
figs_dir / f"heatmap_cosdist_harmony_{'_'.join(genes_of_interest).lower() }.png", | ||
dpi=300, | ||
) | ||
plt.close() | ||
|
||
# %% TODO Once acquired, use ORF data to have a look at these profiles. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env jupyter | ||
|
||
import pandas as pd | ||
from pathlib import Path | ||
|
||
|
||
def load_path(path: str or Path): | ||
return pd.read_parquet(Path(path)) | ||
|
||
|
||
def get_figs_dir(): | ||
figs_dir = Path("../../figs") | ||
figs_dir.mkdir(exist_ok=True) | ||
return figs_dir |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/usr/bin/env jupyter | ||
#!/usr/bin/env jupyter | ||
# --- | ||
# jupyter: | ||
# jupytext: | ||
# text_representation: | ||
# extension: .py | ||
# format_name: percent | ||
# format_version: '1.3' | ||
# jupytext_version: 1.15.2 | ||
# kernelspec: | ||
# display_name: Python 3 | ||
# language: python | ||
# name: python3 | ||
# --- | ||
from itertools import product | ||
from pathlib import Path | ||
|
||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import pandas as pd | ||
import seaborn as sns | ||
from broad_babel.query import run_query | ||
from copairs.compute import pairwise_cosine | ||
|
||
dataset_filepath = ( | ||
Path("..") / ".." / "profiles" / "harmonized_no_sphering_profiles.parquet" | ||
) | ||
figs_dir = Path("../figs") | ||
figs_dir.mkdir(exist_ok=True) | ||
|
||
df = pd.read_parquet(dataset_filepath) | ||
|
||
genes_of_interest = ("RNF41", "MYT1") | ||
# cd44_crispr = run_query(query="CD44", input_column="standard_key", output_column="*") | ||
|
||
gene_matches = {pert: dict() for pert in genes_of_interest} | ||
for gene in genes_of_interest: | ||
results = list( | ||
map( | ||
lambda x: x, | ||
run_query(query=gene, input_column="standard_key", output_column="*"), | ||
) | ||
) | ||
for pert_type in ("crispr", "orf"): | ||
result_array = [x[1] for x in results if x[0] == pert_type] | ||
if len(result_array): | ||
gene_matches[gene][pert_type] = result_array[0] | ||
|
||
|
||
# %% Calculate the correlations of myt1 and rnf41 | ||
# They were reported to interact by collaborators, and Ardigen found them to correlate | ||
|
||
feat_cols = [x for x in df.columns if not x.startswith("Metadata")] | ||
translator_crispr_jump_gene = {v.get("crispr"): k for k, v in gene_matches.items()} | ||
subset = df.loc[df["Metadata_JCP2022"].isin(translator_crispr_jump_gene)] | ||
subset.set_index( | ||
subset["Metadata_JCP2022"].map(translator_crispr_jump_gene), | ||
inplace=True, | ||
) | ||
|
||
data_subset_no_features = subset[feat_cols] | ||
n_samples = len(subset) | ||
all_pair_ixs = np.array(list(product(range(n_samples), range(n_samples)))) | ||
cosine_distance = pairwise_cosine( | ||
data_subset_no_features.values, all_pair_ixs, batch_size=min(200, n_samples) | ||
).reshape((n_samples, n_samples)) | ||
cosine_df = ( | ||
pd.DataFrame( | ||
data=cosine_distance, | ||
index=subset.index, | ||
columns=subset.index, | ||
) | ||
.sort_index(axis=0) | ||
.sort_index(axis=1) | ||
) | ||
sns.heatmap( | ||
cosine_df, | ||
robust=True, | ||
) | ||
plt.tight_layout() | ||
plt.savefig( | ||
figs_dir / f"heatmap_cosdist_harmony_{'_'.join(genes_of_interest).lower() }.png", | ||
dpi=300, | ||
) | ||
plt.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#!/usr/bin/env jupyter | ||
|
||
# Linked issue: https://github.com/broadinstitute/2023_12_JUMP_data_only_vignettes/issues/4 apply_scaler, | ||
# Compare the four genes of interest that have been shown as anticorrelated: | ||
# - Cluster A: | ||
# - RAB40B | ||
# - RAB40C | ||
# - Cluster B: | ||
# - INSYN1 | ||
# - PIK3R3 | ||
|
||
from pathlib import Path | ||
|
||
import pandas as pd # TODO stick to polars | ||
import polars as pl | ||
import scipy.stats as stats | ||
from statsmodels.stats.multitest import multipletests | ||
|
||
fpath = Path( | ||
"/dgx1nas1/storage/data/shared/morphmap_profiles/orf/full_profiles_cc_adj_mean_corr.parquet" | ||
) | ||
genes_of_interest = ("RAB40B", "RAB40C", "INSYN1", "PIK3R3") | ||
|
||
|
||
df = pl.read_parquet(fpath) | ||
sub_df = df.filter(pl.col("Metadata_Symbol").is_in(genes_of_interest)) | ||
groups = dict( | ||
(x, f"{'RAB40B/C' if x.startswith('RAB') else 'INS/PIK'}") | ||
for x in genes_of_interest | ||
) | ||
|
||
clusters = sub_df.with_columns( | ||
pl.col("Metadata_Symbol").replace(groups).alias("Metadata_Cluster") | ||
) | ||
|
||
|
||
medians = clusters.group_by("Metadata_Cluster").median() | ||
diff = medians.select(pl.all().exclude("^Metadata.*$").diff()) | ||
feature_diff = diff.filter(~pl.all_horizontal(pl.all().is_null())).melt() | ||
feature_diff.sort(by="value").write_csv("feature_diffs.csv") | ||
|
||
|
||
cluster_a = clusters.filter(pl.col("Metadata_Cluster") == "RAB40B/C").select( | ||
pl.all().exclude("^Metadata.*$") | ||
) | ||
cluster_b = clusters.filter(pl.col("Metadata_Cluster") == "INS/PIK").select( | ||
pl.all().exclude("^Metadata.*$") | ||
) | ||
results = [] | ||
for feature in cluster_a.columns: | ||
cluster_A = cluster_a.get_column(feature) | ||
cluster_B = cluster_b.get_column(feature) | ||
|
||
# Use t-test or Mann-Whitney U test based on data distribution | ||
t_stat, p_val = stats.ttest_ind(cluster_A, cluster_B, equal_var=False) | ||
|
||
results.append({"Feature": feature, "T-Statistic": t_stat, "P-Value": p_val}) | ||
|
||
|
||
results_df = pd.DataFrame(results) | ||
|
||
# drop rows with NaN | ||
results_df = results_df.dropna() | ||
|
||
# Apply multiple testing correction | ||
|
||
corrected_pvals = multipletests(results_df["P-Value"], method="fdr_bh")[1] | ||
results_df["Corrected P-Value"] = corrected_pvals | ||
|
||
# Filter significant features | ||
significant_features = results_df[results_df["Corrected P-Value"] < 0.05] | ||
significant_features.to_csv("significant_features.csv") |
Oops, something went wrong.