Merge remote-tracking branch 'origin/master' into gh-pages

broadinstitute · Mar 19, 2024 · 5000397 · 5000397
2 parents 7ee5128 + 3572a76
commit 5000397
Show file tree

Hide file tree

Showing 13 changed files with 13,614 additions and 0 deletions.
diff --git a/pills/README.md b/pills/README.md
@@ -0,0 +1,3 @@
+# Readme
+
+This library contains basic examples that show how to interact and make use of the JUMP dataset.
diff --git a/pills/poetry.lock b/pills/poetry.lock
diff --git a/pills/pyproject.toml b/pills/pyproject.toml
@@ -0,0 +1,26 @@
+[tool.poetry]
+name = "pills"
+version = "0.0.0"
+description = "Examples that show how to interact with JUMP data"
+authors = ["Alan Munoz"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.9"
+boto3 = ">=1.33.1"
+matplotlib = "^3.8.2"
+polars = "^0.19.19"
+pyarrow = "^14.0.1"
+s3fs = "^2023.12.1"
+seaborn = "^0.13.2"
+
+[tool.poetry.group.dev.dependencies]
+jupyter = "^1.0.0"
+jupytext = "^1.15.0"
+pytest = "^7.4.1"
+ruff-lsp = "^0.0.48"
+ruff = "<0.2.0"
+
+[tool.jupytext.formats]
+"notebooks/" = ".qmd"
+"scripts/" = "py:percent"
diff --git a/pills/scripts/basic.py b/pills/scripts/basic.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env jupyter
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.16.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+# %% Overview [markdown]
+# # Basic JUMP data access
+# This is a tutorial on how to access
+# We will use polars to fetch the data frame lazily, with the help of s3fs and pyarrow.
+# We prefer lazy loading because the data can be too big to be handled in memory.
+# %% Imports
+import polars as pl
+from pyarrow.dataset import dataset
+from s3fs import S3FileSystem
+
+# %% [markdown]
+# The shapes of the available datasets are:
+# - crispr: Knock-out genetic perturbations.
+# - orf: Overexpression genetic perturbations.
+# - compounds: Chemical genetic perturbations.
+#
+# The aws paths of the dataframes are shown below:
+# %% Paths
+prefix = (
+    "s3://cellpainting-gallery/cpg0016-jump-integrated/source_all/workspace/profiles"
+)
+filepaths = dict(
+    crispr=f"{prefix}/chandrasekaran_2024_0000000/crispr/wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet",
+    orf=f"{prefix}/chandrasekaran_2024_0000000/orf/wellpos_cellcount_mad_outlier_nan_featselect_harmony.parquet",
+    compound=f"{prefix}/arevalo_2023_e834481/compound/mad_int_featselect_harmony.parquet/",
+)
+
+
+# %% [markdown]
+# We use a S3FileSystem to avoid the need of credentials.
+# %%
+def lazy_load(path: str) -> pl.DataFrame:
+    fs = S3FileSystem(anon=True)
+    myds = dataset(path, filesystem=fs)
+    df = pl.scan_pyarrow_dataset(myds)
+    return df
+
+
+# %% [markdown]
+# We will lazy load the data to visualise its columns
+# %%
+print("Width, or number of columns.")
+for name, path in filepaths.items():
+    data = lazy_load(path)
+    metadata_cols = [col for col in data.columns if col.startswith("Metadata")]
+    print(f"{name}: {data.width}, containing {len(metadata_cols)} metadata columns")
+
+# %% [markdown]
+# Let us now focus on the crispr dataset
+# %%
+data = lazy_load(filepaths["crispr"])
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,36 @@
+[tool.poetry]
+name = "jump_stories"
+version = "0.1.0"
+description = ""
+authors = ["Alan"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.11"
+PyYAML = "5.3.1"
+# plotly = "4.14.3"
+# dvc-s3 = "^2.23.0"
+jupyter = "^1.0.0"
+scikit-learn = "^1.3.0"
+scipy = ">=1.8.1"
+matplotlib = "^3.7.3"
+seaborn = "^0.12.2"
+broad-babel = "^0.1.9"
+copairs = {git = "https://github.com/cytomining/copairs.git", rev = "v0.4.0-alpha"}
+numpy = "*"
+fastcluster = "^1.2.6"
+pandas = "*"
+pyarrow = "^14.0.1"
+joblib = "^1.3.2"
+adbc-driver-sqlite = "^0.8.0"
+
+
+[tool.poetry.group.dev.dependencies]
+jupytext = "^1.15.2"
+ipdb = "^0.13.13"
+pyflakes = "^3.1.0"
+ruff-lsp = "^0.0.48"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/workspace/analysis/CD44_HAS2/1_correlations.py b/workspace/analysis/CD44_HAS2/1_correlations.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env jupyter
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.15.2
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+# %% [Markdown]
+"""
+Find correlations between CD44 and HAS2. Question inspired by Prof. Chonghui Chen from Baylor College of Medicine during ASCB2 2023.
+"""
+
+from itertools import product
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from broad_babel.query import run_query
+from copairs.compute import pairwise_cosine
+
+from utils import get_figs_dir, load_path
+
+figs_dir = get_figs_dir()
+df = load_path("../../profiles/harmonized_no_sphering_profiles.parquet")
+
+genes_of_interest = ("CD44", "HAS2")
+cd44_crispr = run_query(query="CD44", input_column="standard_key", output_column="*")
+
+gene_matches = {pert: dict() for pert in genes_of_interest}
+for gene in genes_of_interest:
+    results = list(
+        map(
+            lambda x: x,
+            run_query(query=gene, input_column="standard_key", output_column="*"),
+        )
+    )
+    for pert_type in ("crispr", "orf"):
+        result_array = [x[1] for x in results if x[0] == pert_type]
+        if len(result_array):
+            gene_matches[gene][pert_type] = result_array[0]
+
+# %% Calculate the correlations between has2 and cd44 based on CRISPR data
+
+
+feat_cols = [x for x in df.columns if not x.startswith("Metadata")]
+translator_crispr_jump_gene = {v.get("crispr"): k for k, v in gene_matches.items()}
+subset = df.loc[df["Metadata_JCP2022"].isin(translator_crispr_jump_gene)]
+subset.set_index(
+    subset["Metadata_JCP2022"].map(translator_crispr_jump_gene),
+    inplace=True,
+)
+
+data_subset_no_features = subset[feat_cols]
+n_samples = len(subset)
+all_pair_ixs = np.array(list(product(range(n_samples), range(n_samples))))
+cosine_distance = pairwise_cosine(
+    data_subset_no_features.values, all_pair_ixs, batch_size=min(200, n_samples)
+).reshape((n_samples, n_samples))
+cosine_df = (
+    pd.DataFrame(
+        data=cosine_distance,
+        index=subset.index,
+        columns=subset.index,
+    )
+    .sort_index(axis=0)
+    .sort_index(axis=1)
+)
+sns.heatmap(
+    cosine_df,
+    robust=True,
+)
+plt.tight_layout()
+plt.savefig(
+    figs_dir / f"heatmap_cosdist_harmony_{'_'.join(genes_of_interest).lower() }.png",
+    dpi=300,
+)
+plt.close()
+
+# %% TODO Once acquired, use ORF data to have a look at these profiles.
diff --git a/workspace/analysis/CD44_HAS2/utils.py b/workspace/analysis/CD44_HAS2/utils.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env jupyter
+
+import pandas as pd
+from pathlib import Path
+
+
+def load_path(path: str or Path):
+    return pd.read_parquet(Path(path))
+
+
+def get_figs_dir():
+    figs_dir = Path("../../figs")
+    figs_dir.mkdir(exist_ok=True)
+    return figs_dir
diff --git a/workspace/analysis/MYT1_RNF41/1_correlations.py b/workspace/analysis/MYT1_RNF41/1_correlations.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env jupyter
+#!/usr/bin/env jupyter
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.15.2
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+from itertools import product
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from broad_babel.query import run_query
+from copairs.compute import pairwise_cosine
+
+dataset_filepath = (
+    Path("..") / ".." / "profiles" / "harmonized_no_sphering_profiles.parquet"
+)
+figs_dir = Path("../figs")
+figs_dir.mkdir(exist_ok=True)
+
+df = pd.read_parquet(dataset_filepath)
+
+genes_of_interest = ("RNF41", "MYT1")
+# cd44_crispr = run_query(query="CD44", input_column="standard_key", output_column="*")
+
+gene_matches = {pert: dict() for pert in genes_of_interest}
+for gene in genes_of_interest:
+    results = list(
+        map(
+            lambda x: x,
+            run_query(query=gene, input_column="standard_key", output_column="*"),
+        )
+    )
+    for pert_type in ("crispr", "orf"):
+        result_array = [x[1] for x in results if x[0] == pert_type]
+        if len(result_array):
+            gene_matches[gene][pert_type] = result_array[0]
+
+
+# %% Calculate the correlations of myt1 and rnf41
+# They were reported to interact by collaborators, and Ardigen found them to correlate
+
+feat_cols = [x for x in df.columns if not x.startswith("Metadata")]
+translator_crispr_jump_gene = {v.get("crispr"): k for k, v in gene_matches.items()}
+subset = df.loc[df["Metadata_JCP2022"].isin(translator_crispr_jump_gene)]
+subset.set_index(
+    subset["Metadata_JCP2022"].map(translator_crispr_jump_gene),
+    inplace=True,
+)
+
+data_subset_no_features = subset[feat_cols]
+n_samples = len(subset)
+all_pair_ixs = np.array(list(product(range(n_samples), range(n_samples))))
+cosine_distance = pairwise_cosine(
+    data_subset_no_features.values, all_pair_ixs, batch_size=min(200, n_samples)
+).reshape((n_samples, n_samples))
+cosine_df = (
+    pd.DataFrame(
+        data=cosine_distance,
+        index=subset.index,
+        columns=subset.index,
+    )
+    .sort_index(axis=0)
+    .sort_index(axis=1)
+)
+sns.heatmap(
+    cosine_df,
+    robust=True,
+)
+plt.tight_layout()
+plt.savefig(
+    figs_dir / f"heatmap_cosdist_harmony_{'_'.join(genes_of_interest).lower() }.png",
+    dpi=300,
+)
+plt.close()
diff --git a/workspace/analysis/RAB40B/compare_features.py b/workspace/analysis/RAB40B/compare_features.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env jupyter
+
+# Linked issue: https://github.com/broadinstitute/2023_12_JUMP_data_only_vignettes/issues/4 apply_scaler,
+# Compare the four genes of interest that have been shown as anticorrelated:
+# - Cluster A:
+#  - RAB40B
+#  - RAB40C
+# - Cluster B:
+#  - INSYN1
+#  - PIK3R3
+
+from pathlib import Path
+
+import pandas as pd  # TODO stick to polars
+import polars as pl
+import scipy.stats as stats
+from statsmodels.stats.multitest import multipletests
+
+fpath = Path(
+    "/dgx1nas1/storage/data/shared/morphmap_profiles/orf/full_profiles_cc_adj_mean_corr.parquet"
+)
+genes_of_interest = ("RAB40B", "RAB40C", "INSYN1", "PIK3R3")
+
+
+df = pl.read_parquet(fpath)
+sub_df = df.filter(pl.col("Metadata_Symbol").is_in(genes_of_interest))
+groups = dict(
+    (x, f"{'RAB40B/C' if x.startswith('RAB') else 'INS/PIK'}")
+    for x in genes_of_interest
+)
+
+clusters = sub_df.with_columns(
+    pl.col("Metadata_Symbol").replace(groups).alias("Metadata_Cluster")
+)
+
+
+medians = clusters.group_by("Metadata_Cluster").median()
+diff = medians.select(pl.all().exclude("^Metadata.*$").diff())
+feature_diff = diff.filter(~pl.all_horizontal(pl.all().is_null())).melt()
+feature_diff.sort(by="value").write_csv("feature_diffs.csv")
+
+
+cluster_a = clusters.filter(pl.col("Metadata_Cluster") == "RAB40B/C").select(
+    pl.all().exclude("^Metadata.*$")
+)
+cluster_b = clusters.filter(pl.col("Metadata_Cluster") == "INS/PIK").select(
+    pl.all().exclude("^Metadata.*$")
+)
+results = []
+for feature in cluster_a.columns:
+    cluster_A = cluster_a.get_column(feature)
+    cluster_B = cluster_b.get_column(feature)
+
+    # Use t-test or Mann-Whitney U test based on data distribution
+    t_stat, p_val = stats.ttest_ind(cluster_A, cluster_B, equal_var=False)
+
+    results.append({"Feature": feature, "T-Statistic": t_stat, "P-Value": p_val})
+
+
+results_df = pd.DataFrame(results)
+
+# drop rows with NaN
+results_df = results_df.dropna()
+
+# Apply multiple testing correction
+
+corrected_pvals = multipletests(results_df["P-Value"], method="fdr_bh")[1]
+results_df["Corrected P-Value"] = corrected_pvals
+
+# Filter significant features
+significant_features = results_df[results_df["Corrected P-Value"] < 0.05]
+significant_features.to_csv("significant_features.csv")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Readme

		This library contains basic examples that show how to interact and make use of the JUMP dataset.