Workflow reduced to no on-the-fly models

openproblems-bio · Sep 13, 2024 · 7862520 · 7862520
1 parent b9385f2
commit 7862520
Show file tree

Hide file tree

Showing 11 changed files with 533 additions and 410 deletions.
diff --git a/runs.ipynb b/runs.ipynb
diff --git a/scripts/repo/run_grn_evaluation copy.sh b/scripts/repo/run_grn_evaluation copy.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
+# reg_type=${1} # disabled: take the regression type (GB or ridge) from the first argument
+viash ns build --parallel
+reg_type=ridge  # hard-coded regression type; used in RUN_ID and in every entry's id/reg_type
+
+RUN_ID="grn_evaluation_all_${reg_type}"  # names both the params file and the publish directory
+resources_dir="s3://openproblems-data/resources/grn"
+# resources_dir="./resources"  # local alternative to the S3 location above
+publish_dir="${resources_dir}/results/${RUN_ID}"
+grn_models_folder="${resources_dir}/grn_models"
+
+subsample=-2
+max_workers=10
+layer=scgen_pearson
+metric_ids="[regression_1, regression_2]"  # inserted verbatim into each entry as a YAML flow list
+
+param_file="./params/${RUN_ID}.yaml"  # generated params file; assumes ./params already exists — TODO confirm
+
+grn_names=(  # model names; append_entry maps each to ${grn_models_folder}/<name>.csv
+    "scglue"
+    "scenicplus"
+    "celloracle"
+    "granie"
+    "figr"
+    "collectri"
+    "genie3"
+    "grnboost2"
+    "ppcor"
+    "portia"
+    )
+# Start writing to the YAML file: ">" truncates any previous contents before entries are appended
+cat > $param_file << HERE
+param_list:
+HERE
+
+append_entry() {  # Append one param_list entry for GRN model $1; its prediction is ${grn_models_folder}/$1.csv
+  cat >> $param_file << HERE  # unquoted HERE delimiter: the variables below are expanded when the entry is written
+  - id: ${reg_type}_${1}
+    metric_ids: ${metric_ids}
+    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
+    multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad
+    reg_type: $reg_type
+    method_id: $1
+    subsample: $subsample
+    max_workers: $max_workers
+    tf_all: ${resources_dir}/prior/tf_all.csv
+    layer: ${layer}
+    consensus: ${resources_dir}/prior/consensus-num-regulators.json
+    prediction: ${grn_models_folder}/$1.csv
+HERE
+}
+
+append_entry_control() {  # Append one control/baseline entry: $1=method_id $2=causal $3=corr_method $4=cell_type_specific $5=metacell $6=impute
+  cat >> $param_file << HERE  # prediction below is always grn_models/collectri.csv, regardless of $1
+  - id: ${reg_type}_${1}
+    metric_ids: ${metric_ids}
+    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
+    multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad
+    reg_type: $reg_type
+    method_id: $1
+    subsample: $subsample
+    max_workers: $max_workers
+    tf_all: ${resources_dir}/prior/tf_all.csv
+    layer: ${layer}
+    consensus: ${resources_dir}/prior/consensus-num-regulators.json
+    causal: ${2}
+    corr_method: ${3}
+    prediction: ${resources_dir}/grn_models/collectri.csv
+    cell_type_specific:  ${4}
+    metacell:  ${5}
+    impute: ${6}
+HERE
+
+}
+
+# Loop through grn_names and append one param_list entry per model
+for grn_name in "${grn_names[@]}"; do
+  append_entry "$grn_name"
+done
+
+## controls: arguments are method_id, causal, corr_method, cell_type_specific, metacell, impute
+append_entry_control "negative_control" "" "" "false" "false" "false"  # empty causal/corr_method are written as empty YAML values
+append_entry_control "positive_control" "" "" "false" "false" "false"
+append_entry_control "baseline_pearson" "false" "pearson" "false" "false" "false"
+append_entry_control "baseline_dotproduct" "false" "dotproduct" "false" "false" "false"
+append_entry_control "baseline_dotproduct_causal" "true" "dotproduct" "false" "false" "false"
+append_entry_control "baseline_dotproduct_causal_cell_type" "true" "dotproduct" "true" "false" "false"
+append_entry_control "baseline_dotproduct_causal_metacell" "true" "dotproduct" "false" "true" "false"
+append_entry_control "baseline_dotproduct_causal_impute" "true" "dotproduct" "false" "false" "true"
+append_entry_control "baseline_corr_causal_spearman" "true" "spearman"  # NOTE(review): only 3 args — cell_type_specific, metacell and impute are written empty; confirm intended
+
+
+# Append the remaining output_state and publish_dir to the YAML file
+cat >> $param_file << HERE
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+# nextflow run . \
+#   -main-script  target/nextflow/workflows/run_grn_evaluation/main.nf \
+#   -profile docker \
+#   -with-trace \
+#   -c src/common/nextflow_helpers/labels_ci.config \
+#   -params-file ${param_file}
+# subl resources/results/grn_evaluation_all_ridge/scores.yaml
+
+# ./tw-windows-x86_64.exe launch `
+#     https://github.com/openproblems-bio/task_grn_inference.git `
+#     --revision build/main `
+#     --pull-latest `
+#     --main-script target/nextflow/workflows/run_grn_evaluation/main.nf `
+#     --workspace 53907369739130 `
+#     --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
+#     --params-file ./params/grn_evaluation_so_ridge.yaml `
+#     --config src/common/nextflow_helpers/labels_tw.config
+
+
diff --git a/scripts/run_baselines.sh b/scripts/run_baselines.sh
@@ -0,0 +1,55 @@
+echo  "baseline pearson"  # each section runs one control method via viash and writes its CSV under resources/grn_models/baselines/
+viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
+  --tf_all resources/prior/tf_all.csv \
+  --causal false \
+  --corr_method pearson \
+  --cell_type_specific  false \
+  --metacell  false \
+  --impute false \
+  --prediction resources/grn_models/baselines/baseline_pearson.csv 
+
+echo  "baseline dotproduct"  # same flags as the pearson run, with --corr_method dotproduct
+viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
+  --tf_all resources/prior/tf_all.csv \
+  --causal false \
+  --corr_method dotproduct \
+  --cell_type_specific  false \
+  --metacell  false \
+  --impute false \
+  --prediction resources/grn_models/baselines/baseline_dotproduct.csv 
+
+echo  "baseline dotproduct causal"  # dotproduct with --causal true
+viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
+  --tf_all resources/prior/tf_all.csv \
+  --causal true \
+  --corr_method dotproduct \
+  --cell_type_specific  false \
+  --metacell  false \
+  --impute false \
+  --prediction resources/grn_models/baselines/baseline_dotproduct_causal.csv 
+
+echo  "baseline causal cell type"  # causal dotproduct with --cell_type_specific true
+viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
+  --tf_all resources/prior/tf_all.csv \
+  --causal true \
+  --corr_method dotproduct \
+  --cell_type_specific  true \
+  --metacell  false \
+  --impute false \
+  --prediction resources/grn_models/baselines/baseline_dotproduct_causal_celltype.csv 
+
+echo  "baseline dotproduct causal metacell"  # causal dotproduct with --metacell true
+viash run src/control_methods/baseline_corr/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
+  --tf_all resources/prior/tf_all.csv \
+  --causal true \
+  --corr_method dotproduct \
+  --cell_type_specific  false \
+  --metacell  true \
+  --impute false \
+  --prediction resources/grn_models/baselines/baseline_dotproduct_causal_metacell.csv 
+
+echo  "positive control"  # different component (positive_control); additionally takes --perturbation_data
+viash run src/control_methods/positive_control/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
+  --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
+  --tf_all resources/prior/tf_all.csv \
+  --prediction resources/grn_models/baselines/positive_control.csv 
diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh
@@ -6,15 +6,15 @@ viash ns build --parallel
 reg_type=ridge
 
 RUN_ID="grn_evaluation_all_${reg_type}"
-resources_dir="s3://openproblems-data/resources/grn"
-# resources_dir="./resources"
+# resources_dir="s3://openproblems-data/resources/grn"
+resources_dir="./resources"
 publish_dir="${resources_dir}/results/${RUN_ID}"
 grn_models_folder="${resources_dir}/grn_models"
 
 subsample=-2
 max_workers=10
 layer=scgen_pearson
-metric_ids="[regression_1, regression_2]"
+metric_ids="[regression_1]"
 
 param_file="./params/${RUN_ID}.yaml"
 
@@ -30,6 +30,15 @@ grn_names=(
     "ppcor"
     "portia"
     )
+
+baseline_models=(
+    baseline_pearson
+    baseline_dotproduct
+    baseline_dotproduct_causal
+    baseline_dotproduct_causal_celltype
+    baseline_dotproduct_causal_metacell
+    positive_control
+    )
 # Start writing to the YAML file
 cat > $param_file << HERE
 param_list:
@@ -48,63 +57,36 @@ append_entry() {
     tf_all: ${resources_dir}/prior/tf_all.csv
     layer: ${layer}
     consensus: ${resources_dir}/prior/consensus-num-regulators.json
-    prediction: ${grn_models_folder}/$1.csv
+    prediction: ${2}/$1.csv
 HERE
 }
 
-append_entry_control() {
-  cat >> $param_file << HERE
-  - id: ${reg_type}_${1}
-    metric_ids: ${metric_ids}
-    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
-    multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad
-    reg_type: $reg_type
-    method_id: $1
-    subsample: $subsample
-    max_workers: $max_workers
-    tf_all: ${resources_dir}/prior/tf_all.csv
-    layer: ${layer}
-    consensus: ${resources_dir}/prior/consensus-num-regulators.json
-    causal: ${2}
-    corr_method: ${3}
-    prediction: ${resources_dir}/grn_models/collectri.csv
-    cell_type_specific:  ${4}
-    metacell:  ${5}
-    impute: ${6}
-HERE
 
-}
+# folder=${grn_models_folder}
+# # Loop through grn_names and layers
+# for grn_name in "${grn_names[@]}"; do
+#   append_entry "$grn_name"  "$folder"
+# done
 
-Loop through grn_names and layers
-for grn_name in "${grn_names[@]}"; do
-  append_entry "$grn_name" 
+folder=${grn_models_folder}/baselines
+for grn_name in "${baseline_models[@]}"; do
+  append_entry "$grn_name" "$folder" 
 done
 
-## controls
-append_entry_control "negative_control" "" "" "false" "false" "false"
-append_entry_control "positive_control" "" "" "false" "false" "false"
-append_entry_control "baseline_pearson" "false" "pearson" "false" "false" "false"
-append_entry_control "baseline_dotproduct" "false" "dotproduct" "false" "false" "false"
-append_entry_control "baseline_dotproduct_causal" "true" "dotproduct" "false" "false" "false"
-append_entry_control "baseline_dotproduct_causal_cell_type" "true" "dotproduct" "true" "false" "false"
-append_entry_control "baseline_dotproduct_causal_metacell" "true" "dotproduct" "false" "true" "false"
-append_entry_control "baseline_dotproduct_causal_impute" "true" "dotproduct" "false" "false" "true"
-append_entry_control "baseline_corr_causal_spearman" "true" "spearman"
-
 
 # Append the remaining output_state and publish_dir to the YAML file
 cat >> $param_file << HERE
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-# nextflow run . \
-#   -main-script  target/nextflow/workflows/run_grn_evaluation/main.nf \
-#   -profile docker \
-#   -with-trace \
-#   -c src/common/nextflow_helpers/labels_ci.config \
-#   -params-file ${param_file}
-# subl resources/results/grn_evaluation_all_ridge/scores.yaml
+nextflow run . \
+  -main-script  target/nextflow/workflows/run_grn_evaluation/main.nf \
+  -profile docker \
+  -with-trace \
+  -c src/common/nextflow_helpers/labels_ci.config \
+  -params-file ${param_file}
+subl resources/results/grn_evaluation_all_ridge/scores.yaml
 
 # ./tw-windows-x86_64.exe launch `
 #     https://github.com/openproblems-bio/task_grn_inference.git `

diff --git a/src/control_methods/baseline_corr/script.py b/src/control_methods/baseline_corr/script.py
@@ -5,6 +5,7 @@
 import scanpy as sc
 from tqdm import tqdm
 from scipy.stats import spearmanr
+from sklearn.preprocessing import StandardScaler
 
 ## VIASH START
 par = {
@@ -14,7 +15,8 @@
 
 ## VIASH END
 
-def select_top_links(net, par):
+def process_links(net, par):
+    net = net[net.source!=net.target]
     net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index)
     net = net_sorted.head(par['max_n_links']).reset_index(drop=True)
     return net
@@ -24,9 +26,11 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray, method="pearson"):
     for group in tqdm(np.unique(groups), desc="Processing groups"):
         X_sub = X[groups == group, :]
         if method == "dotproduct":
-            net = X_sub.T.dot(X_sub)
+            X_sub = StandardScaler().fit_transform(X_sub)
+            net = np.dot(X_sub.T, X_sub) / X_sub.shape[0]
         elif method == "pearson":
             net = np.corrcoef(X_sub.T)
+            # net = pd.DataFrame(X_sub).transpose().corr().values.to_numpy()
             net = np.nan_to_num(net, nan=0.0, posinf=0.0, neginf=0.0)
         elif method == "spearman":
             net = spearmanr(X_sub).statistic
@@ -41,7 +45,7 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray, method="pearson"):
         net = net.reset_index().melt(id_vars='index', var_name='source', value_name='weight')
         net.rename(columns={'index': 'target'}, inplace=True)
 
-        net = select_top_links(net, par)
+        net = process_links(net, par)
         net['cell_type'] = group
         if i==0:
             grn = net
@@ -53,8 +57,7 @@ def create_corr_net(X: np.ndarray, groups: np.ndarray, method="pearson"):
     if par['cell_type_specific']==False:
         grn.drop(columns=['cell_type'], inplace=True)
         grn = grn.groupby(['source', 'target']).mean().reset_index()
-        net = select_top_links(net, par)
-
+        grn = process_links(grn, par)        
     return grn
 print('Read data')
 multiomics_rna = ad.read_h5ad(par["multiomics_rna"])

diff --git a/src/control_methods/positive_control/config.vsh.yaml b/src/control_methods/positive_control/config.vsh.yaml
@@ -13,6 +13,11 @@ functionality:
       required: true
       direction: input
       example: resources_test/grn-benchmark/perturbation_data.h5ad
+    - name: --layer
+      type: string
+      direction: input
+      required: false
+      default: scgen_pearson
 
   resources:
     - type: python_script

diff --git a/src/control_methods/positive_control/script.py b/src/control_methods/positive_control/script.py
@@ -42,6 +42,14 @@ def create_positive_control(X: np.ndarray, groups: np.ndarray):
 
 pivoted_net = pivoted_net.rename(columns={'index': 'target'})
 pivoted_net = pivoted_net[pivoted_net['weight'] != 0]
+
+
+def process_links(net, par):  # trim a link table to its strongest non-self links
+    net = net[net.source!=net.target]  # drop rows where source equals target (self-links)
+    net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index)  # order rows by descending |weight|
+    net = net_sorted.head(par['max_n_links']).reset_index(drop=True)  # keep at most par['max_n_links'] rows, renumbered from 0
+    return net
+pivoted_net = process_links(pivoted_net, par)
 print('Saving')
 pivoted_net.to_csv(par["prediction"])
 
diff --git a/src/exp_analysis/test.sh b/src/exp_analysis/test.sh
@@ -1,4 +1,4 @@
 viash run src/exp_analysis/config.vsh.yaml -- \
     --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
-    --prediction output/baseline_corr.csv \
+    --prediction resources/grn_models/genie3.csv \