Skip to content

Commit

Permalink
preliminary evaluation of network work based data
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Sep 20, 2024
1 parent ec0eeb6 commit 730b9e4
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 35 deletions.
143 changes: 127 additions & 16 deletions runs.ipynb
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -105,30 +110,136 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pending runs "
"# Marco data "
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 8133 × 22787\n",
" obs: 'cell_type', 'donor_id'\n",
" var: 'gene_ids', 'interval', 'mean', 'std'\n",
" uns: 'log1p'\n",
" layers: 'lognorm'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"__MACOSX\t\t\t\t shalek.h5ad\n",
"han.h5ad\t\t\t\t shalek_KDunion.csv\n",
"han_KDunion.csv\t\t\t\t shalek_chipunion.csv\n",
"han_chipunion.csv\t\t\t shalek_chipunion_KDUnion_intersect.csv\n",
"han_chipunion_KDUnion_intersect.csv\t zhao.h5ad\n",
"jackson.h5ad\t\t\t\t zhao_KDunion.csv\n",
"jackson_KDunion.csv\t\t\t zhao_chipunion.csv\n",
"jackson_chipunion.csv\t\t\t zhao_chipunion_KDUnion_intersect.csv\n",
"jackson_chipunion_KDUnion_intersect.csv\n"
]
}
],
"source": []
"source": [
"!ls resources_local/mccalla_extended/"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"zhao-KDunion. adata shape: (36199, 8442), GT size: (105136, 3), Gene overlap: (8425,)\n",
"/viash_automount/tmp/viash-run-regression_1-8RZIHd.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.020129 0.03817 0.009021\n",
"(3,) (3,)\n",
"zhao-chipunion. adata shape: (36199, 8442), GT size: (60662, 3), Gene overlap: (8378,)\n",
"/viash_automount/tmp/viash-run-regression_1-rLuWXt.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.006815 0.064118 0.028652\n",
"(3,) (3,)\n",
"zhao-chipunion_KDUnion_intersect. adata shape: (36199, 8442), GT size: (9019, 3), Gene overlap: (4493,)\n",
"/viash_automount/tmp/viash-run-regression_1-nsE1Jg.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.004249 0.000488 -0.00188\n",
"(3,) (3,)\n",
"shalek-KDunion. adata shape: (1211, 9411), GT size: (148047, 3), Gene overlap: (8784,)\n",
"/viash_automount/tmp/viash-run-regression_1-54532G.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.010243 0.104809 0.047283\n",
"(3,) (3,)\n",
"shalek-chipunion. adata shape: (1211, 9411), GT size: (96383, 3), Gene overlap: (8670,)\n",
"/viash_automount/tmp/viash-run-regression_1-9tO5h1.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.003727 0.042922 0.019598\n",
"(3,) (3,)\n",
"shalek-chipunion_KDUnion_intersect. adata shape: (1211, 9411), GT size: (67705, 3), Gene overlap: (7852,)\n",
"/viash_automount/tmp/viash-run-regression_1-5SN7ZR.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.002252 0.094459 0.046103\n",
"(3,) (3,)\n",
"han-KDunion. adata shape: (5520, 7465), GT size: (77400, 3), Gene overlap: (7387,)\n",
"/viash_automount/tmp/viash-run-regression_1-eSKe1T.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.019109 0.031554 0.006222\n",
"(3,) (3,)\n",
"han-chipunion. adata shape: (5520, 7465), GT size: (160038, 3), Gene overlap: (7458,)\n",
"/viash_automount/tmp/viash-run-regression_1-VnCHsA.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.013187 0.071785 0.029299\n",
"(3,) (3,)\n",
"han-chipunion_KDUnion_intersect. adata shape: (5520, 7465), GT size: (8463, 3), Gene overlap: (4141,)\n",
"/viash_automount/tmp/viash-run-regression_1-0OpbTu.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.004217 0.004052 -0.000082\n",
"(3,) (3,)\n",
"jackson-KDunion. adata shape: (17396, 5736), GT size: (27433, 3), Gene overlap: (4785,)\n",
"/viash_automount/tmp/viash-run-regression_1-wvATsv.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.078523 0.238183 0.07983\n",
"(3,) (3,)\n",
"jackson-chipunion. adata shape: (17396, 5736), GT size: (24481, 3), Gene overlap: (4898,)\n",
"/viash_automount/tmp/viash-run-regression_1-GudAWm.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 -0.027772 0.186608 0.079418\n",
"(3,) (3,)\n",
"jackson-chipunion_KDUnion_intersect. adata shape: (17396, 5736), GT size: (2661, 3), Gene overlap: (1515,)\n",
"/viash_automount/tmp/viash-run-regression_1-lrY1xV.py:56: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" output = ad.AnnData(\n",
" ex(False)_tf(-1) ex(True)_tf(-1) Mean\n",
"0 0.004694 0.321477 0.163086\n",
"(3,) (3,)\n"
]
}
],
"source": [
"import subprocess\n",
"import anndata as ad \n",
"import pandas as pd\n",
"import numpy as np\n",
"for cell_type in ['zhao', 'shalek', 'han', 'jackson']:\n",
" adata = ad.read_h5ad(f'resources_local/mccalla_extended/{cell_type}.h5ad')\n",
" adata.layers['norm'] = adata.X\n",
" adata.obs['cell_type'] = 'onecelltype'\n",
" adata.write(f'resources_local/mccalla_extended/{cell_type}.h5ad')\n",
" subsample = min([10000, len(adata)])\n",
" for GT in ['KDunion', 'chipunion', 'chipunion_KDUnion_intersect']:\n",
" GT_df = pd.read_csv(f'resources_local/mccalla_extended/{cell_type}_{GT}.csv')\n",
" gene_overlap = np.intersect1d(adata.var_names, GT_df.target.unique()).shape\n",
" print(f\"{cell_type}-{GT}. adata shape: {adata.shape}, GT size: {GT_df.shape}, Gene overlap: {gene_overlap}\")\n",
" command = f\"viash run src/metrics/regression_1/config.vsh.yaml -- --perturbation_data resources_local/mccalla_extended/{cell_type}.h5ad --prediction resources_local/mccalla_extended/{cell_type}_{GT}.csv --layer norm --subsample {subsample} --apply_tf false --tf_all resources/prior/tf_all.csv --max_n_links -1 --verbose 1 --score output/{cell_type}_{GT}.h5ad\"\n",
" subprocess.run(command, shell=True, check=True)"
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -346,7 +457,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# To be categorized"
"# Metacell"
]
},
{
Expand Down
4 changes: 4 additions & 0 deletions src/api/comp_metric.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ functionality:
- name: --max_n_links
type: integer
default: 50000
- name: --verbose
type: integer
default: 2
direction: input



Expand Down
2 changes: 2 additions & 0 deletions src/metrics/regression_1/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ functionality:
- type: python_script
path: script.py
- path: main.py
- path: /src/utils/util.py
dest: util.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
Expand Down
31 changes: 12 additions & 19 deletions src/metrics/regression_1/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,10 @@
import pandas as pd
import anndata as ad
from tqdm import tqdm

from util import verbose_print, process_links, verbose_tqdm
import os


def select_top_links(net, par):
print("Number of links reduced to ", par['max_n_links'])
net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index)
net = net_sorted.head(par['max_n_links']).reset_index(drop=True)
return net

class lightgbm_wrapper:
def __init__(self, params, max_workers=None):
self.params = params
Expand Down Expand Up @@ -117,10 +111,9 @@ def cross_validation(net, prturb_adata, par:dict):
feature_fraction=0.05, verbosity=-1)
regr = lightgbm_wrapper(params, max_workers)
else:
print(f'{reg_type} is not defined')
raise ValueError("define first")
raise ValueError(f"{reg_type} is not defined.")

for group in tqdm(unique_groups, desc="Processing groups"):
for group in verbose_tqdm(unique_groups, "Processing groups", 2, par['verbose']):
mask_va = groups == group
mask_tr = ~mask_va
X_tr = X[mask_tr & mask_shared_genes, :]
Expand All @@ -144,16 +137,17 @@ def regression_1(
cell_types = prturb_adata.obs.cell_type.unique()
score_list = []
for cell_type in cell_types:
print(f'----cross validate for {cell_type}----')
verbose_print(par['verbose'], f'----cross validate for {cell_type}----', 2)
# check if net is cell type specific
if 'cell_type' in net.columns:
if cell_type not in net.cell_type.unique():
raise ValueError(f'{cell_type} is not present in grn.')
net_sub = net[net.cell_type==cell_type]
else:
net_sub = net
if len(net_sub)>par['max_n_links']:
net_sub = select_top_links(net_sub, par) #only top links are considered
if (len(net_sub)>par['max_n_links']) and (par['max_n_links']!=-1):
net_sub = process_links(net_sub, par) #only top links are considered
verbose_print(par['verbose'], f"Number of links reduced to {par['max_n_links']}", 2)

prturb_adata_sub = prturb_adata[prturb_adata.obs.cell_type==cell_type,:]
y_true_sub, y_pred_sub = cross_validation(net_sub, prturb_adata_sub, par)
Expand Down Expand Up @@ -201,7 +195,7 @@ def main(par):
manipulate = None
## read and process input data

print('Reading input files', flush=True)
verbose_print(par['verbose'], 'Reading input files', 3)

perturbation_data = ad.read_h5ad(par['perturbation_data'])
tf_all = np.loadtxt(par['tf_all'], dtype=str)
Expand Down Expand Up @@ -236,14 +230,13 @@ def main(par):
perturbation_data = perturbation_data[mask,:]
else:
perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]

print(perturbation_data.shape)

verbose_print(par['verbose'], perturbation_data.shape,4)
perturbation_data.X = perturbation_data.layers[layer]



print(f'Compute metrics for layer: {layer}', flush=True)
verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3)

tfs_cases = [-1]
if par['min_tf']:
tfs_cases += 30
Expand All @@ -253,7 +246,7 @@ def main(par):
par['exclude_missing_genes'] = exclude_missing_genes
par['tf_n'] = tf_n
run_key = f'ex({exclude_missing_genes})_tf({tf_n})'
print(run_key)
verbose_print(par['verbose'], f'Run for metric: {run_key}', 3)

output = regression_1(net, perturbation_data, par=par, max_workers=max_workers)

Expand Down
8 changes: 8 additions & 0 deletions src/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp

def verbose_print(verbose_level, message, level):
if level <= verbose_level:
print(message)
def verbose_tqdm(iterable, desc, level, verbose_level):
if level <= verbose_level:
return tqdm(iterable, desc=desc)
else:
return iterable # Return the iterable without a progress bar


def process_links(net, par):
Expand Down

0 comments on commit 730b9e4

Please sign in to comment.