From 662b8821303a11b1ade2ef2e7ced2218f397c3c4 Mon Sep 17 00:00:00 2001 From: Jalil Nourisa Date: Mon, 18 Nov 2024 19:23:27 +0100 Subject: [PATCH] small update to the metrics --- runs.ipynb | 1303 +++++++++++------- src/control_methods/negative_control/main.py | 7 +- src/helper.py | 13 +- src/metrics/regression_1/main.py | 75 +- src/metrics/regression_1/script.py | 9 +- src/metrics/script_all.py | 142 +- src/workflows/run_benchmark/config.vsh.yaml | 2 +- 7 files changed, 1001 insertions(+), 550 deletions(-) diff --git a/runs.ipynb b/runs.ipynb index eeeea0f6..2b31db78 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,6 +88,7 @@ "# Suppress all FutureWarnings\n", "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", + "\n", "sys.path.append('../')\n", "from grn_benchmark.src.helper import surragate_names\n", "from src.helper import *\n", @@ -125,28 +126,18 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "portia\n", - "Job portia submitted successfully.\n", - "Submitted batch job 7824332\n", - "\n", - "grnboost2\n", - "Job grnboost2 submitted successfully.\n", - "Submitted batch job 7824333\n", - "\n", - "ppcor\n", - "Job ppcor submitted successfully.\n", - "Submitted batch job 7824334\n", - "\n", - "scenic\n", - "Job scenic submitted successfully.\n", - "Submitted batch job 7824335\n", + "negative_control\n", + "Job negative_control submitted successfully.\n", + "{'rna': 'resources/inference_datasets/norman_rna.h5ad', 'prediction': 'resources/grn_models/norman//negative_control.csv', 'tf_all': 'resources/prior/tf_all.csv', 'max_n_links': 50000, 'num_workers': '10'}\n", + "Reading input data\n", + "Inferring GRN\n", "\n" ] } @@ -156,6 +147,25 @@ " run_grn_inference()" ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "adamson_rna.h5ad norman_rna.h5ad op_atac.rds op_rna.rds\n", + "nakatake_rna.h5ad op_atac.h5ad op_rna.h5ad replogle2_rna.h5ad\n" + ] + } + ], + "source": [ + "# pd.read_csv('resources/grn_models/norman//pearson_corr.csv').weight.hist()\n", + "!ls resources/inference_datasets/" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -165,14 +175,14 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Submitted batch job 7761215\n" + "Submitted batch job 7838763\n" ] } ], @@ -186,371 +196,377 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
-  model             S1        S2        static-theta-0.0  static-theta-0.5  static-theta-1.0  rank
-  collectri         0.000000  0.000000  0.259349          0.315582          0.280466            11
-  negative_control  0.000000  0.000000  0.217135          0.299650          0.280869            12
-  positive_control  0.364502  0.559577  0.649432          0.479137          0.301135             2
-  pearson_corr      0.265268  0.454829  0.586512          0.451191          0.294040             4
-  portia            0.202255  0.294004  0.509898          0.371612          0.288194             7
-  ppcor             0.000000  0.000000  0.362309          0.335297          0.281065            10
-  grnboost2         0.336660  0.412794  0.618150          0.529452          0.322410             1
-  scenic            0.136290  0.191758  0.559705          0.457482          0.309582             6
-  granie            0.072278  0.093852  0.173809          0.257547          0.270805            13
-  scglue            0.061218  0.256443  0.512147          0.340961          0.283731             8
-  celloracle        0.181850  0.232943  0.603507          0.472276          0.300507             5
-  figr              0.096687  0.166194  0.287194          0.348258          0.285871             9
-  scenicplus        0.293761  0.367213  0.624330          0.522903          0.318579             3

+  model                                             S1         S2         static-theta-0.0  static-theta-0.5  rank
+  ANANSE_tissue_networks_lung.parquet               0.011492   0.042744   0.178132          0.261868            11
+  ANANSE_tissue_networks_stomach.parquet            0.000624   0.003118   0.173932          0.258268            14
+  ANANSE_tissue_networks_heart.parquet              0.002072   0.010872   0.181734          0.260008            13
+  ANANSE_tissue_networks_bone_marrow.parquet        0.008027   0.034896   0.186020          0.267544            10
+  gtex_rna_networks_Whole_Blood.parquet             0.099496   0.379754   0.216277          0.269727             4
+  gtex_rna_networks_Brain_Amygdala.parquet          0.029852   0.143870   0.198248          0.255889             9
+  gtex_rna_networks_Breast_Mammary_Tissue.parquet   0.056548   0.237685   0.211207          0.259725             7
+  gtex_rna_networks_Lung.parquet                    0.065444   0.286007   0.213101          0.263349             5
+  gtex_rna_networks_Stomach.parquet                 0.057147   0.236925   0.203369          0.259223             8
+  cellnet_human_Hg1332_networks_bcell.parquet       0.158922   0.693849   0.256486          0.276557             2
+  cellnet_human_Hg1332_networks_tcell.parquet       0.132044   0.629700   0.227577          0.275464             3
+  cellnet_human_Hg1332_networks_skin.parquet        0.001219   0.130518   0.158223          0.244314            17
+  cellnet_human_Hg1332_networks_neuron.parquet      0.001315   0.076079   0.174214          0.247301            15
+  cellnet_human_Hg1332_networks_heart.parquet       0.001850   0.055443   0.177527          0.248315            16
+  collectri                                         0.058315   0.126899   0.243231          0.270225             6
+  negative_control                                 -0.000923  -0.000961   0.193545          0.261919            12
+  positive_control                                  0.721750   1.188803   0.654307          0.412647             1
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 13, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_scores = pd.read_csv(f\"resources/results/op/scores/50000-skeleton_False-binarize_True_pearson-ridge.csv\", index_col=0)\n", - "df_scores[df_scores<0] = 0\n", + "df_scores = pd.read_csv(f\"output/temp/op/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "# df_scores[df_scores<0] = 0\n", "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", "df_scores.style.background_gradient()" @@ -570,173 +586,193 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
-  model             S1        S2        static-theta-0.0  static-theta-0.5  rank
-  negative_control  0.000000  0.000000  0.004070          0.007549             7
-  positive_control  0.000000  0.434921  0.070334          0.051472             2
-  pearson_corr      0.000000  0.423552  0.068009          0.049308             3
-  portia            0.000000  0.478517  0.052368          0.043075             4
-  ppcor             0.000000  0.000000  0.007153          0.009689             6
-  grnboost2         0.000000  0.555928  0.083347          0.067546             1
-  scenic            0.000000  0.256618  0.031547          0.021333             5

+  model              S1         S2         static-theta-0.0  static-theta-0.5  rank
+  negative_control   0.007890   0.007940   0.004070          0.007549             6
+  positive_control   0.014413   0.019584   0.070334          0.051472             1
+  pearson_corr      -0.008077  -0.010913   0.068009          0.049308             4
+  portia             0.005164   0.005206   0.052368          0.043075             3
+  ppcor             -0.025308  -0.025607   0.007153          0.009689             7
+  grnboost2         -0.007698  -0.008728   0.083347          0.067546             2
+  scenic             0.005057   0.006892   0.031547          0.021333             5
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 10, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/replogle2/50000-skeleton_False-binarize_True_X-ridge.csv\", index_col=0)\n", - "df_scores[df_scores<0] = 0\n", + "df_scores = pd.read_csv(f\"output/temp/replogle2/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "# df_scores[df_scores<0] = 0\n", "\n", "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", @@ -757,137 +793,201 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
-  model             S1        S2        static-theta-0.0  static-theta-0.5  rank
-  negative_control  0.000000  0.000000  0.002804          0.029785             5
-  positive_control  0.000000  0.561853  0.180887          0.216097             1
-  pearson_corr      0.000000  0.564298  0.147750          0.162404             2
-  portia            0.151603  0.043896  0.072315          0.056640             4
-  grnboost2         0.000000  0.357502  0.108230          0.159036             3

+  model              S1         S2         static-theta-0.0  static-theta-0.5  rank
+  negative_control  -0.000793  -0.000918   0.001298          0.027644             7
+  positive_control   0.000521   0.001464   0.069749          0.188348             2
+  pearson_corr       0.002166   0.006022   0.037342          0.145707             1
+  portia             0.000022   0.001393   0.016685          0.053786             5
+  ppcor             -0.000075  -0.000431   0.008624          0.032166             6
+  grnboost2         -0.001241  -0.001925   0.031132          0.159744             4
+  scenic             0.003932   0.006827   0.005207          0.076417             3
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 20, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/nakatake/50000-skeleton_False-binarize_True_X-ridge.csv\", index_col=0)\n", - "df_scores[df_scores<0] = 0\n", + "df_scores = pd.read_csv(f\"output/temp/nakatake/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "# df_scores[df_scores<0] = 0\n", "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", "df_scores.style.background_gradient()" @@ -924,92 +1024,355 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
-  model              S1         S2         static-theta-0.0  static-theta-0.5  rank
-  negative_control  -0.024738  -0.056955   0.025719          0.045412             3
-  positive_control   0.002988   0.649137   0.122582          0.092433             1
-  pearson_corr       0.028859   0.532826   0.071062          0.072747             2

+  model              S1         S2         static-theta-0.0  static-theta-0.5  rank
+  positive_control  -0.003265  -0.003272   0.166573          0.081166             3
+  pearson_corr       0.006265   0.006275   0.163243          0.078115             1
+  portia            -0.002045  -0.002045   0.112395          0.054794             5
+  ppcor              0.005009   0.005009   0.126298          0.057014             2
+  grnboost2         -0.010134  -0.010161   0.177797          0.093298             4
+  scenic            -0.001991  -0.011845   0.099435          0.038660             6
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 19, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_scores = pd.read_csv(f\"output/temp/norman/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "# df_scores[df_scores<0] = 0\n", + "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", + "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", + "df_scores.style.background_gradient()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
+  model              S1         S2         static-theta-0.0  static-theta-0.5  rank
+  negative_control  -0.002391  -0.002391   0.052893          0.082694             5
+  positive_control  -0.023432  -0.029590   0.201072          0.145379             4
+  pearson_corr       0.005145   0.006337   0.197701          0.144272             1
+  portia            -0.023758  -0.024276   0.097925          0.107532             6
+  ppcor             -0.008878  -0.008878   0.102301          0.107225             3
+  grnboost2         -0.020748  -0.024696   0.228183          0.177432             2
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/nakatake/10000-skeleton_False-binarize_False_X-ridge.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"output/temp/adamson/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "# df_scores[df_scores<0] = 0\n", "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", "df_scores.style.background_gradient()" diff --git a/src/control_methods/negative_control/main.py b/src/control_methods/negative_control/main.py index 1f934d8c..42fe76dc 100644 --- a/src/control_methods/negative_control/main.py +++ b/src/control_methods/negative_control/main.py @@ -6,7 +6,10 @@ def process_links(net, par): net = net[net.source!=net.target] - net = net.sample(par['max_n_links']) + try: + net = net.sample(par['max_n_links']) + except: + net = net.sample(par['max_n_links'], replace=True) return net def main(par): print('Reading input data') @@ -29,7 +32,9 @@ def create_negative_control(gene_names) -> np.ndarray: pivoted_net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight') pivoted_net = pivoted_net.rename(columns={'index': 'target'}) + pivoted_net = pivoted_net[pivoted_net['weight'] != 0] + pivoted_net = process_links(pivoted_net, par) return pivoted_net diff --git a/src/helper.py b/src/helper.py index e5b2e3c4..80ced594 100644 --- a/src/helper.py +++ b/src/helper.py @@ -81,13 +81,12 @@ def run_grn_seqera(): raise ValueError('define first') def run_grn_inference(): - # - - dataset = 'nakatake' # 'replogle2' 'op' - sbatch = True + dataset = 'norman' # 'replogle2' 'op' norman + sbatch = False partition='cpu' - if dataset == 'op': + if dataset in ['op', 'norman']: normalize = True else: normalize = False @@ -95,14 +94,14 @@ def run_grn_inference(): par = { # 'methods': ["positive_control", "negative_control", "pearson_corr", "portia", "grnboost2", "ppcor", "scenic"], - 'methods': ["portia", "grnboost2", "ppcor", "scenic"], + 'methods': ["negative_control"], 'models_dir': f'resources/grn_models/{dataset}/', 'rna': f'resources/inference_datasets/{dataset}_rna.h5ad', 'rna_positive_control': f'resources/datasets_raw/{dataset}.h5ad', 'num_workers': 10, - 'mem': "64GB", - 'time': "12:00:00", + 'mem': "120GB", + 'time': "24:00:00", 'normalize': normalize, # 'max_n_links': 10000, } diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py index fe73f467..5d90a670 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/main.py @@ -68,34 +68,33 @@ def cv_5(genes_n): return groups def cross_validation(net, prturb_adata, par:dict): + np.random.seed(32) + gene_names = prturb_adata.var_names + net = process_net(net.copy(), gene_names) gene_names_grn = net.index.to_numpy() n_tfs = net.shape[1] # construct feature and target space - if par['exclude_missing_genes']: - included_genes = gene_names_grn - else: - included_genes = gene_names - - X_df = pd.DataFrame(np.zeros((len(included_genes), n_tfs)), index=included_genes) + X_df = pd.DataFrame(np.zeros((len(gene_names), n_tfs)), index=gene_names) try: train_df = pd.DataFrame(prturb_adata.X, columns=gene_names).T except: train_df = pd.DataFrame(prturb_adata.X.todense().A, columns=gene_names).T - Y_df = train_df.loc[included_genes,:] + Y_df = train_df.loc[gene_names,:] mask_gene_net = 
X_df.index.isin(net.index) # fill the actual regulatory links X_df.loc[mask_gene_net, :] = net.values X = X_df.values.copy() + # run cv - groups = cv_5(len(included_genes)) + groups = cv_5(len(gene_names)) # initialize y_pred with the mean of gene expressed across all samples y_pred = Y_df.copy() means = Y_df.mean(axis=0) @@ -128,32 +127,62 @@ def cross_validation(net, prturb_adata, par:dict): raise ValueError(f"{reg_type} is not defined.") reg_models = [] - + n_baselines = 100 + y_pred_baselines = [y_pred.copy() for i in range(n_baselines)] for group in verbose_tqdm(unique_groups, "Processing groups", 2, par['verbose']): mask_va = groups == group mask_tr = ~mask_va X_tr = X[mask_tr & mask_gene_net, :] Y_tr = Y[mask_tr & mask_gene_net, :] - print(X.shape, Y.shape, X_tr.shape, Y_tr.shape, X[mask_va & mask_gene_net, :].shape) + X_va = X[mask_va & mask_gene_net, :] if X_tr.shape[0]<2: continue regr.fit(X_tr, Y_tr) - y_pred[mask_va & mask_gene_net, :] = regr.predict(X[mask_va & mask_gene_net, :]) + Y_pr = regr.predict(X_va) + y_pred[mask_va & mask_gene_net, :] = Y_pr.copy() + + # Shuffle each column independently and keep the same shape + for i in range(n_baselines): + for col in range(Y_pr.shape[1]): + np.random.shuffle(Y_pr[:, col]) + y_pred_baselines[i][mask_va & mask_gene_net, :] = Y_pr reg_models.append(regr) - + + # - normalized r2 score r2score = r2_score(y_true, y_pred, multioutput='variance_weighted') + r2score_baselines = [] + for y_pred_baseline in y_pred_baselines: + r2score_baselines.append(r2_score(y_true, y_pred_baseline, multioutput='variance_weighted')) + r2score_n = r2score - np.mean(r2score_baselines) + + # - normalized r2 score - S2 + r2score = r2_score(y_true[mask_gene_net, :], y_pred[mask_gene_net, :], multioutput='variance_weighted') + r2score_baselines = [] + for y_pred_baseline in y_pred_baselines: + r2score_baselines.append(r2_score(y_true[mask_gene_net, :], y_pred_baseline[mask_gene_net, :], multioutput='variance_weighted')) + r2score_n_s2 = r2score - np.mean(r2score_baselines) + + # - normalized r2 scores per sample r2score_samples = [] for i_sample in range(y_true.shape[1]): - score_sample = r2_score(y_true[:, i_sample], y_pred[:, i_sample], multioutput='variance_weighted') - r2score_samples.append(score_sample) + score_sample = r2_score(y_true[:, i_sample], y_pred[:, i_sample]) + r2score_baselines = [] + for y_pred_baseline in y_pred_baselines: + r2score_baselines.append(r2_score(y_true[:, i_sample], y_pred_baseline[:, i_sample])) + r2score_samples.append(score_sample - np.mean(r2score_baselines)) + + results = { - 'r2score-aver': r2score, + 'r2score-aver': r2score_n, + 'r2score-aver-s2': r2score_n_s2, 'r2scores': r2score_samples, 'reg_models' : reg_models } + + return results def regression_1( @@ -173,6 +202,7 @@ def regression_1( else: donor_ids = ['default'] score_list = [] + score_s2_list = [] for donor_id in donor_ids: verbose_print(par['verbose'], f'----cross validate for {donor_id}----', 2) # check if net is donor specific @@ -196,8 +226,11 @@ def regression_1( score_list.append(results['r2score-aver']) - mean_score_r2 = np.mean(score_list) - output = dict(mean_score_r2=mean_score_r2) + score_s2_list.append(results['r2score-aver-s2']) + s1 = [np.mean(score_list)] + s2 = [np.mean(score_s2_list)] + + output = dict(S1=s1, S2=s2) return output def set_global_seed(seed): @@ -296,11 +329,9 @@ def main(par): verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3) - results = {} - par['exclude_missing_genes'] = False - results['S1'] = 
[regression_1(net, evaluation_data, par=par, max_workers=max_workers)['mean_score_r2']] - par['exclude_missing_genes'] = True - results['S2'] = [regression_1(net, evaluation_data, par=par, max_workers=max_workers)['mean_score_r2']] + + results = regression_1(net, evaluation_data, par=par, max_workers=max_workers) + df_results = pd.DataFrame(results) return df_results \ No newline at end of file diff --git a/src/metrics/regression_1/script.py b/src/metrics/regression_1/script.py index 1e1405bc..3086d3c6 100644 --- a/src/metrics/regression_1/script.py +++ b/src/metrics/regression_1/script.py @@ -3,18 +3,21 @@ import sys import numpy as np + +file_name = 'op' #nakatake + ## VIASH START par = { - "evaluation_data": "resources/evaluation_datasets/nakatake_perturbation.h5ad", + "evaluation_data": f"resources/evaluation_datasets/{file_name}_perturbation.h5ad", "tf_all": "resources/prior/tf_all.csv", # "prediction": "output/pearson_net.csv", - "prediction": "resources/grn_models/nakatake/portia.csv", + "prediction": f"resources/grn_models/{file_name}/grnboost2.csv", "method_id": "scenic", "max_n_links": 50000, "apply_tf": True, 'score': 'output/score.h5ad', 'reg_type': 'ridge', - 'layer': 'X', + 'layer': 'pearson', 'subsample': -1, 'num_workers': 4, 'skeleton': 'resources/prior/skeleton.csv', diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py index aa7d0b7a..2c2a9809 100644 --- a/src/metrics/script_all.py +++ b/src/metrics/script_all.py @@ -4,37 +4,59 @@ import numpy as np import os -## VIASH START -dataset = 'nakatake' #replogle2, op, nakatake +def define_par(dataset): + if dataset == 'op': + layer = 'pearson' + elif dataset in ['norman']: + layer = 'pearson' + elif dataset in ['replogle2', 'nakatake', 'adamson']: + layer = 'X' + else: + raise ValueError('define first') -if dataset == 'op': - layer = 'pearson' -elif dataset in ['replogle2', 'nakatake']: - layer = 'X' -else: - raise ValueError('define first') - -if True: #op par = { - 'reg_type': 'ridge', - 'models_dir': f"resources/grn_models/{dataset}", - 'scores_dir': f"output/temp/{dataset}", - - 'models': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], + 'reg_type': 'ridge', + 'models_dir': f"resources/grn_models/{dataset}", + 'scores_dir': f"output/temp/{dataset}", + + 'models': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], - "evaluation_data": f"resources/evaluation_datasets/{dataset}_perturbation.h5ad", - 'consensus': f'resources/prior/{dataset}_consensus-num-regulators.json', + # 'models': [ 'positive_control', 'pearson_corr'], + 'global_models': [ + 'ANANSE_tissue/networks/lung.parquet', + 'ANANSE_tissue/networks/stomach.parquet', + 'ANANSE_tissue/networks/heart.parquet', + 'ANANSE_tissue/networks/bone_marrow.parquet', + + 'gtex_rna/networks/Whole_Blood.parquet', + 'gtex_rna/networks/Brain_Amygdala.parquet', + 'gtex_rna/networks/Breast_Mammary_Tissue.parquet', + 'gtex_rna/networks/Lung.parquet', + 'gtex_rna/networks/Stomach.parquet', - 'layers': [layer], - - "tf_all": "resources/prior/tf_all.csv", - 'skeleton': 'resources/prior/skeleton.csv', - "apply_tf": True, - 'subsample': -1, - 'verbose': 4, - 'num_workers': 20 -} + + 'cellnet_human_Hg1332/networks/bcell.parquet', + 'cellnet_human_Hg1332/networks/tcell.parquet', + 'cellnet_human_Hg1332/networks/skin.parquet', + 
'cellnet_human_Hg1332/networks/neuron.parquet', + 'cellnet_human_Hg1332/networks/heart.parquet', + ], + 'global_models_dir': '../eric/network_collection/networks/', + + "evaluation_data": f"resources/evaluation_datasets/{dataset}_perturbation.h5ad", + 'consensus': f'resources/prior/{dataset}_consensus-num-regulators.json', + + 'layer': layer, + + "tf_all": "resources/prior/tf_all.csv", + 'skeleton': 'resources/prior/skeleton.csv', + "apply_tf": True, + 'subsample': -1, + 'verbose': 4, + 'num_workers': 20 + } + return par meta = { @@ -44,38 +66,66 @@ sys.path.append(meta["resources_dir"]) sys.path.append(meta["util"]) -os.makedirs(par['scores_dir'], exist_ok=True) +from regression_1.main import main as main_reg1 +from regression_1.main import binarize_weight +from regression_2.main import main as main_reg2 +from util import process_links # - run consensus -from consensus.script import main -main(par) +from consensus.script import main as main_consensus + +# - include general models? +global_models = False # - run metrics -for binarize in [False, True]: - par['binarize'] = binarize - for max_n_links in [50000, 10000]: - par['max_n_links'] = max_n_links - for apply_skeleton in [False]: - par['apply_skeleton'] = apply_skeleton - for layer in par['layers']: - par['layer'] = layer - i = 0 +for dataset in ['norman', 'adamson']: #'replogle2', 'nakatake', norman + print('------ ', dataset, '------') + par = define_par(dataset) + os.makedirs(par['scores_dir'], exist_ok=True) + main_consensus(par) + for binarize in [True]: + par['binarize'] = binarize + for max_n_links in [50000]: + par['max_n_links'] = max_n_links + for apply_skeleton in [False]: + par['apply_skeleton'] = apply_skeleton + # - determines models to run + grn_files_dict = {} + # - add global models + if global_models: + for model in par['global_models']: + temp_dir = f"{par['scores_dir']}/nets/" + os.makedirs(temp_dir, exist_ok=True) + net = pd.read_parquet(f"{par['global_models_dir']}/{model}") + net.columns = ['source','target','weight'] + net = process_links(net, par) + if par['binarize']: + net['weight'] = net['weight'].apply(binarize_weight) + model = model.replace('/','_') + grn_file = f'{temp_dir}/{model}.csv' + net.to_csv(grn_file) + grn_files_dict[model] = grn_file + # - add actual models for model in par['models']: print(model) - par['prediction'] = f"{par['models_dir']}/{model}.csv" - if not os.path.exists(par['prediction']): - print(f"{par['prediction']} doesnt exist. Skipped.") + grn_file = f"{par['models_dir']}/{model}.csv" + if not os.path.exists(grn_file): + print(f"{grn_file} doesnt exist. 
Skipped.") continue - from regression_1.main import main - reg1 = main(par) - from regression_2.main import main - reg2 = main(par) + grn_files_dict[model] = grn_file + + # - actual runs + i = 0 + for model, grn_file in grn_files_dict.items(): + par['prediction'] = grn_file + reg1 = main_reg1(par) + reg2 = main_reg2(par) score = pd.concat([reg1, reg2], axis=1) score.index = [model] if i==0: df_all = score else: df_all = pd.concat([df_all, score]) - df_all.to_csv(f"{par['scores_dir']}/{max_n_links}-skeleton_{apply_skeleton}-binarize_{binarize}_{layer}-{par['reg_type']}.csv") + df_all.to_csv(f"{par['scores_dir']}/{max_n_links}-skeleton_{apply_skeleton}-binarize_{binarize}-{par['reg_type']}.csv") print(df_all) i+=1 diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 6ffe9739..a4794840 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -96,7 +96,7 @@ functionality: - name: grn_methods/portia - name: grn_methods/grnboost2 - name: grn_methods/scenic - - name: grn_methods/genie3 + # - name: grn_methods/genie3 - name: grn_methods/ppcor #needs docker image # - name: grn_methods/scgpt