Commit

Merge remote-tracking branch 'origin/master' into gh-pages

afermg committed Mar 19, 2024
2 parents 7ee5128 + 3572a76 commit 5000397
Showing 13 changed files with 13,614 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pills/README.md
@@ -0,0 +1,3 @@
# Readme

This library contains basic examples that show how to interact with and make use of the JUMP dataset.
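
As a taste, the sketch below mirrors `pills/scripts/basic.py` (the S3 path is taken from that script): profiles can be scanned lazily, straight from the public Cell Painting Gallery bucket.

```python
import polars as pl
from pyarrow.dataset import dataset
from s3fs import S3FileSystem

# Anonymous access: the bucket is public, so no credentials are needed.
fs = S3FileSystem(anon=True)
path = (
    "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles"
    "/chandrasekaran_2024_0000000/crispr/wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet"
)
# scan_pyarrow_dataset is lazy: nothing is read until .collect() is called.
profiles = pl.scan_pyarrow_dataset(dataset(path, filesystem=fs))
```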
3,796 changes: 3,796 additions & 0 deletions pills/poetry.lock

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions pills/pyproject.toml
@@ -0,0 +1,26 @@
[tool.poetry]
name = "pills"
version = "0.0.0"
description = "Examples that show how to interact with JUMP data"
authors = ["Alan Munoz"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.9"
boto3 = ">=1.33.1"
matplotlib = "^3.8.2"
polars = "^0.19.19"
pyarrow = "^14.0.1"
s3fs = "^2023.12.1"
seaborn = "^0.13.2"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
jupytext = "^1.15.0"
pytest = "^7.4.1"
ruff-lsp = "^0.0.48"
ruff = "<0.2.0"

[tool.jupytext.formats]
"notebooks/" = ".qmd"
"scripts/" = "py:percent"
65 changes: 65 additions & 0 deletions pills/scripts/basic.py
@@ -0,0 +1,65 @@
#!/usr/bin/env jupyter
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.1
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# %% Overview [markdown]
# # Basic JUMP data access
# This is a tutorial on how to access JUMP data.
# We will use polars to fetch the data frame lazily, with the help of s3fs and pyarrow.
# We prefer lazy loading because the data can be too big to be handled in memory.
# %% Imports
import polars as pl
from pyarrow.dataset import dataset
from s3fs import S3FileSystem

# %% [markdown]
# The available datasets are:
# - crispr: Knock-out genetic perturbations.
# - orf: Overexpression genetic perturbations.
# - compound: Chemical perturbations.
#
# The AWS paths of the data frames are shown below:
# %% Paths
prefix = (
    "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles"
)
filepaths = dict(
    crispr=f"{prefix}/chandrasekaran_2024_0000000/crispr/wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet",
    orf=f"{prefix}/chandrasekaran_2024_0000000/orf/wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet",
    compound=f"{prefix}/arevalo_2023_e834481/compound/mad_int_featselect_harmony.parquet/",
)


# %% [markdown]
# We use an anonymous S3FileSystem to avoid the need for credentials.
# %%
def lazy_load(path: str) -> pl.LazyFrame:
    # Anonymous access means no credentials are required for this public bucket.
    fs = S3FileSystem(anon=True)
    myds = dataset(path, filesystem=fs)
    # scan_pyarrow_dataset returns a LazyFrame; nothing is read until .collect().
    df = pl.scan_pyarrow_dataset(myds)
    return df


# %% [markdown]
# We lazy-load the data to inspect its columns.
# %%
print("Width, or number of columns.")
for name, path in filepaths.items():
    data = lazy_load(path)
    metadata_cols = [col for col in data.columns if col.startswith("Metadata")]
    print(f"{name}: {data.width}, containing {len(metadata_cols)} metadata columns")

# %% [markdown]
# Let us now focus on the crispr dataset.
# %%
data = lazy_load(filepaths["crispr"])
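# %% [markdown]
# As a quick sanity check (a minimal sketch, assuming the columns follow the
# "Metadata" prefix convention above) we can collect just the first few rows of
# the metadata columns without materialising the full data frame:
# %%
print(data.select(pl.col("^Metadata.*$")).head(5).collect())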
3,398 changes: 3,398 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions pyproject.toml
@@ -0,0 +1,36 @@
[tool.poetry]
name = "jump_stories"
version = "0.1.0"
description = ""
authors = ["Alan"]
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.9,<3.11"
PyYAML = "5.3.1"
# plotly = "4.14.3"
# dvc-s3 = "^2.23.0"
jupyter = "^1.0.0"
scikit-learn = "^1.3.0"
scipy = ">=1.8.1"
matplotlib = "^3.7.3"
seaborn = "^0.12.2"
broad-babel = "^0.1.9"
copairs = {git = "https://github.com/cytomining/copairs.git", rev = "v0.4.0-alpha"}
numpy = "*"
fastcluster = "^1.2.6"
pandas = "*"
pyarrow = "^14.0.1"
joblib = "^1.3.2"
adbc-driver-sqlite = "^0.8.0"


[tool.poetry.group.dev.dependencies]
jupytext = "^1.15.2"
ipdb = "^0.13.13"
pyflakes = "^3.1.0"
ruff-lsp = "^0.0.48"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
88 changes: 88 additions & 0 deletions workspace/analysis/CD44_HAS2/1_correlations.py
@@ -0,0 +1,88 @@
#!/usr/bin/env jupyter
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# %% [markdown]
"""
Find correlations between CD44 and HAS2. Question inspired by Prof. Chonghui Chen
from Baylor College of Medicine during ASCB 2023.
"""

from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from broad_babel.query import run_query
from copairs.compute import pairwise_cosine

from utils import get_figs_dir, load_path

figs_dir = get_figs_dir()
df = load_path("../../profiles/harmonized_no_sphering_profiles.parquet")

genes_of_interest = ("CD44", "HAS2")
cd44_crispr = run_query(query="CD44", input_column="standard_key", output_column="*")

gene_matches = {pert: dict() for pert in genes_of_interest}
for gene in genes_of_interest:
    # Query broad_babel for all perturbation records matching this gene symbol.
    results = list(run_query(query=gene, input_column="standard_key", output_column="*"))
    for pert_type in ("crispr", "orf"):
        result_array = [x[1] for x in results if x[0] == pert_type]
        if len(result_array):
            gene_matches[gene][pert_type] = result_array[0]

# %% Calculate the correlations between HAS2 and CD44 based on CRISPR data


feat_cols = [x for x in df.columns if not x.startswith("Metadata")]
translator_crispr_jump_gene = {v.get("crispr"): k for k, v in gene_matches.items()}
subset = df.loc[df["Metadata_JCP2022"].isin(translator_crispr_jump_gene)].copy()
subset.set_index(
    subset["Metadata_JCP2022"].map(translator_crispr_jump_gene),
    inplace=True,
)

data_subset_no_features = subset[feat_cols]
n_samples = len(subset)
# Enumerate every ordered pair of rows; pairwise_cosine consumes index pairs.
all_pair_ixs = np.array(list(product(range(n_samples), range(n_samples))))
cosine_distance = pairwise_cosine(
    data_subset_no_features.values, all_pair_ixs, batch_size=min(200, n_samples)
).reshape((n_samples, n_samples))
cosine_df = (
    pd.DataFrame(
        data=cosine_distance,
        index=subset.index,
        columns=subset.index,
    )
    .sort_index(axis=0)
    .sort_index(axis=1)
)
sns.heatmap(
    cosine_df,
    robust=True,
)
plt.tight_layout()
plt.savefig(
    figs_dir / f"heatmap_cosdist_harmony_{'_'.join(genes_of_interest).lower()}.png",
    dpi=300,
)
plt.close()

# %% TODO Once acquired, use ORF data to have a look at these profiles.
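# A sketch of that follow-up, assuming the ORF profiles land in the same parquet
# layout (the path below is hypothetical; the file is not yet available):
# orf_df = load_path("../../profiles/orf_harmonized_profiles.parquet")
# translator_orf_jump_gene = {v["orf"]: k for k, v in gene_matches.items() if "orf" in v}
# orf_subset = orf_df.loc[orf_df["Metadata_JCP2022"].isin(translator_orf_jump_gene)]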
14 changes: 14 additions & 0 deletions workspace/analysis/CD44_HAS2/utils.py
@@ -0,0 +1,14 @@
#!/usr/bin/env jupyter

from pathlib import Path
from typing import Union

import pandas as pd


def load_path(path: Union[str, Path]) -> pd.DataFrame:
    return pd.read_parquet(Path(path))


def get_figs_dir() -> Path:
    figs_dir = Path("../../figs")
    figs_dir.mkdir(exist_ok=True)
    return figs_dir
86 changes: 86 additions & 0 deletions workspace/analysis/MYT1_RNF41/1_correlations.py
@@ -0,0 +1,86 @@
#!/usr/bin/env jupyter
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.15.2
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from broad_babel.query import run_query
from copairs.compute import pairwise_cosine

dataset_filepath = (
    Path("..") / ".." / "profiles" / "harmonized_no_sphering_profiles.parquet"
)
figs_dir = Path("../figs")
figs_dir.mkdir(exist_ok=True)

df = pd.read_parquet(dataset_filepath)

genes_of_interest = ("RNF41", "MYT1")
# cd44_crispr = run_query(query="CD44", input_column="standard_key", output_column="*")

gene_matches = {pert: dict() for pert in genes_of_interest}
for gene in genes_of_interest:
    # Query broad_babel for all perturbation records matching this gene symbol.
    results = list(run_query(query=gene, input_column="standard_key", output_column="*"))
    for pert_type in ("crispr", "orf"):
        result_array = [x[1] for x in results if x[0] == pert_type]
        if len(result_array):
            gene_matches[gene][pert_type] = result_array[0]


# %% Calculate the correlations of MYT1 and RNF41
# They were reported to interact by collaborators, and Ardigen found them to correlate.

feat_cols = [x for x in df.columns if not x.startswith("Metadata")]
translator_crispr_jump_gene = {v.get("crispr"): k for k, v in gene_matches.items()}
subset = df.loc[df["Metadata_JCP2022"].isin(translator_crispr_jump_gene)].copy()
subset.set_index(
    subset["Metadata_JCP2022"].map(translator_crispr_jump_gene),
    inplace=True,
)

data_subset_no_features = subset[feat_cols]
n_samples = len(subset)
# Enumerate every ordered pair of rows; pairwise_cosine consumes index pairs.
all_pair_ixs = np.array(list(product(range(n_samples), range(n_samples))))
cosine_distance = pairwise_cosine(
    data_subset_no_features.values, all_pair_ixs, batch_size=min(200, n_samples)
).reshape((n_samples, n_samples))
cosine_df = (
    pd.DataFrame(
        data=cosine_distance,
        index=subset.index,
        columns=subset.index,
    )
    .sort_index(axis=0)
    .sort_index(axis=1)
)
sns.heatmap(
    cosine_df,
    robust=True,
)
plt.tight_layout()
plt.savefig(
    figs_dir / f"heatmap_cosdist_harmony_{'_'.join(genes_of_interest).lower()}.png",
    dpi=300,
)
plt.close()
72 changes: 72 additions & 0 deletions workspace/analysis/RAB40B/compare_features.py
@@ -0,0 +1,72 @@
#!/usr/bin/env jupyter

# Linked issue: https://github.com/broadinstitute/2023_12_JUMP_data_only_vignettes/issues/4
# Compare the four genes of interest that have been shown as anticorrelated:
# - Cluster A:
# - RAB40B
# - RAB40C
# - Cluster B:
# - INSYN1
# - PIK3R3

from pathlib import Path

import pandas as pd # TODO stick to polars
import polars as pl
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

fpath = Path(
    "/dgx1nas1/storage/data/shared/morphmap_profiles/orf/full_profiles_cc_adj_mean_corr.parquet"
)
genes_of_interest = ("RAB40B", "RAB40C", "INSYN1", "PIK3R3")


df = pl.read_parquet(fpath)
sub_df = df.filter(pl.col("Metadata_Symbol").is_in(genes_of_interest))
groups = {
    x: "RAB40B/C" if x.startswith("RAB") else "INS/PIK"
    for x in genes_of_interest
}

clusters = sub_df.with_columns(
    pl.col("Metadata_Symbol").replace(groups).alias("Metadata_Cluster")
)


medians = clusters.group_by("Metadata_Cluster").median()
# diff() between the two cluster medians leaves one non-null row per feature.
diff = medians.select(pl.all().exclude("^Metadata.*$").diff())
feature_diff = diff.filter(~pl.all_horizontal(pl.all().is_null())).melt()
feature_diff.sort(by="value").write_csv("feature_diffs.csv")


cluster_a = clusters.filter(pl.col("Metadata_Cluster") == "RAB40B/C").select(
    pl.all().exclude("^Metadata.*$")
)
cluster_b = clusters.filter(pl.col("Metadata_Cluster") == "INS/PIK").select(
    pl.all().exclude("^Metadata.*$")
)
results = []
for feature in cluster_a.columns:
    cluster_A = cluster_a.get_column(feature)
    cluster_B = cluster_b.get_column(feature)

    # Welch's t-test (unequal variances); a rank-based Mann-Whitney alternative
    # is sketched below.
    t_stat, p_val = stats.ttest_ind(cluster_A, cluster_B, equal_var=False)

    results.append({"Feature": feature, "T-Statistic": t_stat, "P-Value": p_val})


results_df = pd.DataFrame(results)

# drop rows with NaN
results_df = results_df.dropna()

# Apply multiple testing correction

corrected_pvals = multipletests(results_df["P-Value"], method="fdr_bh")[1]
results_df["Corrected P-Value"] = corrected_pvals

# Filter significant features
significant_features = results_df[results_df["Corrected P-Value"] < 0.05]
significant_features.to_csv("significant_features.csv")
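
# A minimal sketch of the Mann-Whitney U alternative mentioned above, for
# features whose distributions are clearly non-normal (same per-feature loop,
# using scipy.stats.mannwhitneyu):
mw_results = []
for feature in cluster_a.columns:
    u_stat, p_val = stats.mannwhitneyu(
        cluster_a.get_column(feature),
        cluster_b.get_column(feature),
        alternative="two-sided",
    )
    mw_results.append({"Feature": feature, "U-Statistic": u_stat, "P-Value": p_val})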