Commit f7b438a

run evaluation script is added

janursa committed Aug 8, 2024
1 parent c663e06 commit f7b438a
Showing 12 changed files with 199 additions and 56 deletions.
2 changes: 1 addition & 1 deletion params/params.yaml
@@ -3,4 +3,4 @@ param_list:
   perturbation_counts: "s3://openproblems-data/resources/grn/datasets_raw/perturbation_counts.h5ad"

 output_state: "state.yaml"
-publish_dir: "s3://openproblems-data/resources/grn/results/process_perturbation/run_2"
+publish_dir: "s3://openproblems-data/resources/grn/results/process_perturbation/run_4"
45 changes: 23 additions & 22 deletions scripts/run_evaluation.sh
@@ -1,47 +1,48 @@
 #!/bin/bash

 # Default values
-grn=""
-sample="200" # Default value for sample
+grn_name="test"
+subsample="200" # Default value for sample
 reg_type="ridge"
 score="out/score.csv"
+layer="pearson"
+max_workers="4"

 # Parse arguments
 while [[ "$#" -gt 0 ]]; do
     case $1 in
-        --grn) grn="$2"; shift ;;
-        --sample) sample="$2"; shift ;;
+        --grn_model) grn_model="$2"; shift ;;
+        --grn_name) grn_name="$2"; shift ;;
+        --subsample) subsample="$2"; shift ;;
         --reg_type) reg_type="$2"; shift ;;
         --score) score="$2"; shift ;;
+        --layer) layer="$2"; shift ;;
+        --max_workers) max_workers="$2"; shift ;;
         *) echo "Unknown parameter passed: $1"; exit 1 ;;
     esac
     shift
 done

 # Ensure required arguments are provided
-if [ -z "$grn" ]; then
-    echo "Usage: $0 --grn <grn_file> [--sample <sample_value>]"
+if [ -z "$grn_model" ]; then
+    echo "Usage: $0 --grn_model <grn_file> [--subsample <sample_value>]"
     exit 1
 fi

 # Print parsed arguments (for debugging purposes)
-echo "GRN file: $grn"
-echo "Sample value: $sample"
-echo "Regression model: $reg_type"
+echo "GRN name: $grn_name Regression model: $reg_type Samples: $subsample layer: $layer max_workers: $max_workers"

 # Clean bin/ folder
 rm -r bin
 mkdir bin

-# Run regression analysis 1
-echo "Running GRN benchmark with $grn and sample size $sample"
+# # Run regression analysis 1
+echo "Regression 1"
-mkdir -p bin/regression_1
-viash build src/metrics/regression_1/config.vsh.yaml -p docker -o bin/regression_1
-bin/regression_1/regression_1 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
+score_reg1="${score/.csv/_reg1.csv}"
+viash run src/metrics/regression_1/config.vsh.yaml -- --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
+    --reg_type $reg_type --prediction $grn_model --score $score_reg1 --layer $layer --subsample $subsample --max_workers $max_workers

-# Run regression analysis 2
+# # Run regression analysis 2
+echo "Regression 2"
-mkdir -p bin/regression_2
-viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2
-bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
+score_reg2="${score/.csv/_reg2.csv}"
+time viash run src/metrics/regression_1/config.vsh.yaml -- --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
+    --reg_type $reg_type --prediction $grn_model --score $score_reg2 --layer $layer --subsample $subsample --max_workers $max_workers

+paste -d ',' "$score_reg1" <(cut -d ',' -f2- "$score_reg2") > $score
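For orientation, the reworked script would be invoked roughly as follows; this is a hypothetical call, with the model path and output path borrowed from defaults used elsewhere in this commit:

# Hypothetical invocation; file paths are examples only.
bash scripts/run_evaluation.sh \
    --grn_model resources/grn-benchmark/grn_models/celloracle.csv \
    --grn_name celloracle \
    --reg_type ridge \
    --subsample 200 \
    --layer pearson \
    --max_workers 4 \
    --score out/celloracle.csv

The closing paste line merges the two metric files column-wise: cut -d ',' -f2- strips the index column from the regression-2 scores, and paste -d ',' appends the remaining columns to the regression-1 rows, yielding one combined score CSV.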
49 changes: 45 additions & 4 deletions scripts/run_evaluation_all.sh
@@ -1,15 +1,56 @@
 #!/bin/bash

 models_folder="resources/grn-benchmark/grn_models"
-grn_models=(
+grn_names=(
     "celloracle"
     "scenicplus"
     "figr"
     "granie"
     "scglue"
+    "positive_control"
+    "negative_control"
 )
+reg_type="ridge"
+subsample="2"
+max_workers="4"
+folder="out"

-# Loop through each GRN model and run the benchmark script
-for grn_model in "${grn_models[@]}"; do
-    bash scripts/run_evaluation_grn.sh --grn "$models_folder/$grn_model.csv" --sample 200 --reg_type ridge --score "out/$grn_model.csv"

+# layer="pearson"
+layers=("pearson" "lognorm" "scgen_pearson" "scgen_lognorm" "seurat_pearson" "seurat_lognorm")
+
+for layer in "${layers[@]}"; do
+    # Loop through each GRN model and run the benchmark script
+    layer_folder="${folder}/${layer}"
+    mkdir $layer_folder
+    for grn_name in "${grn_names[@]}"; do
+        grn_model="${models_folder}/${grn_name}.csv"
+        score="${layer_folder}/${grn_name}.csv"
+        bash scripts/run_evaluation.sh --grn_model "$grn_model" \
+            --grn_name "$grn_name" --subsample "$subsample" --reg_type "$reg_type" \
+            --score "$score" --max_workers "$max_workers" --layer "$layer"
+    done
 done

+# aggregate the scores across layers (axis=0)
+for grn_name in "${grn_names[@]}"; do
+    # Define a file to store the aggregated scores
+    aggregated_score="${folder}/${grn_name}.csv"
+
+    # Remove the file if it exists (to avoid appending to old data)
+    rm -f "$aggregated_score"
+
+    for layer in "${layers[@]}"; do
+        # Define the path to the score file for the current layer
+        layer_folder="${folder}/${layer}"
+        score="${layer_folder}/${grn_name}.csv"
+
+        # Append the score to the aggregated file
+        # Skip the header for all but the first file
+        if [ -s "$aggregated_score" ]; then
+            tail -n +2 "$score" >> "$aggregated_score"
+        else
+            cat "$score" > "$aggregated_score"
+        fi
+    done
+done
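Assuming the defaults above and a successful run, the aggregation step leaves a layout along these lines (illustrative sketch, one model shown):

out/
  pearson/celloracle.csv        # per-layer scores, one header row each
  lognorm/celloracle.csv
  ...                           # remaining four layers
  celloracle.csv                # aggregated: six layer rows under a single header

The tail -n +2 / cat branch keeps the header only from the first per-layer file, so the aggregated CSV stacks the layer rows (axis=0) without repeated headers.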
29 changes: 12 additions & 17 deletions src/metrics/regression_1/main.py
@@ -8,6 +8,8 @@
 from concurrent.futures import ThreadPoolExecutor
 import pandas as pd
 import anndata as ad
+from tqdm import tqdm
+
 import os

@@ -50,7 +52,7 @@ def cv_5(genes_n):
     return groups


-def run_method_1(
+def regression_1(
     net: pd.DataFrame,
     train_df: pd.DataFrame,
     reg_type: str = 'GB',
@@ -85,7 +87,6 @@ def run_method_1(
     Y_df = train_df.loc[included_genes,:]

     mask_shared_genes = X_df.index.isin(net.index)
-    # print(X_df.shape, Y_df.shape)

     # fill the actuall regulatory links
     X_df.loc[mask_shared_genes, :] = net.values
@@ -99,38 +100,28 @@ def run_method_1(
     y_pred[:] = means
     y_pred = y_pred.values

     # initialize y_true
     Y = Y_df.values
     y_true = Y.copy()

     unique_groups = np.unique(groups)
-    for group in unique_groups:
+    for group in tqdm(unique_groups, desc="Processing groups"):
         mask_va = groups == group
         mask_tr = ~mask_va

         # Use logical AND to combine masks correctly
         X_tr = X[mask_tr & mask_shared_genes, :]
         Y_tr = Y[mask_tr & mask_shared_genes, :]

         regr.fit(X_tr, Y_tr)

         y_pred[mask_va & mask_shared_genes, :] = regr.predict(X[mask_va & mask_shared_genes, :])

-    # assert ~(y_true==0).any()
-
-    # if verbose >= 1:
-    score_r2 = r2_score(y_true, y_pred, multioutput='variance_weighted') #uniform_average', 'variance_weighted
-    # print(f'score_r2: ', score_r2)
-
+    mean_score_r2 = r2_score(y_true, y_pred, multioutput='variance_weighted')
     # gene_scores_r2 = r2_score(y_true.T, y_pred.T, multioutput='raw_values')

     output = dict(mean_score_r2=mean_score_r2)

     return output

 def set_global_seed(seed):
@@ -198,6 +189,8 @@ def main(par):
     layer_results = {}  # Store results for this layer
     for exclude_missing_genes in [True, False]:  # two settings on target gene
         for tf_n in [-1, 140]:  # two settings on tfs
+            run_key = f'ex({exclude_missing_genes})_tf({tf_n})'
+            print(run_key)
             net_subset = net_processed.copy()

             # Subset TFs
@@ -211,9 +204,9 @@ def main(par):
             degrees = net_subset.abs().sum(axis=0)
             net_subset = net_subset[degrees.nlargest(tf_n).index]

-            output = run_method_1(net_subset, pert_df, exclude_missing_genes=exclude_missing_genes, reg_type=reg_type, max_workers=max_workers)
-            result_key = f'ex({exclude_missing_genes})_tf({tf_n})'
-            layer_results[result_key] = [output['mean_score_r2']]
+            output = regression_1(net_subset, pert_df, exclude_missing_genes=exclude_missing_genes, reg_type=reg_type, max_workers=max_workers)
+
+            layer_results[run_key] = [output['mean_score_r2']]

     # Convert results to DataFrame
     df_results = pd.DataFrame(layer_results)
@@ -222,4 +215,6 @@ def main(par):
     if 'ex(False)_tf(140)' not in df_results.columns:
         df_results['ex(False)_tf(140)'] = df_results['ex(False)_tf(-1)']

+    df_results['Mean'] = df_results.mean(axis=1)
+
     return df_results
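For readers skimming the diff, the renamed regression_1 routine amounts to leave-one-group-out cross-validation with a multi-output regressor, scored by variance-weighted R². A minimal sketch, assuming dense arrays and plain ridge regression (the gene masking, subsampling, and parallelism in main.py are omitted; cv_score is a hypothetical helper name):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

def cv_score(X, Y, groups):
    # Hold out one group at a time, train on the rest, predict the held-out rows.
    y_pred = np.zeros_like(Y, dtype=float)
    regr = Ridge()
    for group in np.unique(groups):
        mask_va = groups == group   # validation rows
        mask_tr = ~mask_va          # training rows
        regr.fit(X[mask_tr, :], Y[mask_tr, :])
        y_pred[mask_va, :] = regr.predict(X[mask_va, :])
    # Variance-weighted R2 across genes, as in the diff above.
    return r2_score(Y, y_pred, multioutput='variance_weighted')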
103 changes: 103 additions & 0 deletions src/metrics/regression_1/plot_scores.ipynb

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions src/metrics/regression_1/script.py
@@ -15,15 +15,13 @@
 }

 ## VIASH END
-print('Reading input data')

 sys.path.append(meta["resources_dir"])
 from main import main

 output = main(par)
-print(output)
-# output.columns = ['S1', 'S2', 'S3', 'S4']

+output.index=[par["layer"]]
 print("Write output to file", flush=True)
 print(output)
 output.to_csv(par['score'])
@@ -8,7 +8,7 @@ functionality:
   summary: "Correct batch effects using scgen"

   arguments:
-    - name: --perturbation_data_n
+    - name: --perturbation_data
       type: file
       info:
         label: perturbation
@@ -6,13 +6,13 @@

 ## VIASH START
 par = {
-    'perturbation_data_n': 'resources/grn-benchmark/perturbation_data.h5ad',
+    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
     "perturbation_data_bc": 'resources/grn-benchmark/perturbation_data.h5ad'
 }
 ## VIASH END
 batch_key = 'plate_name'
 label_key = 'cell_type'
-bulk_adata = ad.read_h5ad(par['perturbation_data_n'])
+bulk_adata = ad.read_h5ad(par['perturbation_data'])
 print(bulk_adata)

 for norm_name in ['lognorm', 'pearson']:
6 changes: 4 additions & 2 deletions src/process_data/perturbation/normalization/config.vsh.yaml
@@ -10,7 +10,7 @@ functionality:

   arguments:

-    - name: --perturbation_data_f
+    - name: --pseudobulked_data_f
       type: file
       info:
         label: perturbation
@@ -26,8 +26,9 @@
       required: false
       direction: input
       default: resources_local/pseudobulked_data_f
+      example: resources_test/grn-benchmark/perturbation_data.h5ad

-    - name: --perturbation_data_n
+    - name: --perturbation_data
       type: file
       info:
         label: perturbation
@@ -50,6 +51,7 @@
       required: false
       direction: input
       default: resources/grn-benchmark/perturbation_data.h5ad
+      example: resources_test/grn-benchmark/perturbation_data.h5ad


 resources:
4 changes: 2 additions & 2 deletions src/process_data/perturbation/normalization/script.py
@@ -11,7 +11,7 @@

 ## VIASH START
 par = {
     'pseudobulked_data_f': 'resources/grn-benchmark/perturbation_data.h5ad',
-    'perturbation_data_n': 'resources/grn-benchmark/perturbation_data.h5ad'
+    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad'
 }
 ## VIASH END

@@ -35,4 +35,4 @@ def normalize_func(bulk_adata):
 bulk_adata_n = normalize_func(bulk_adata_filtered)
 print("Normalizing completed")
 print("Writing the file")
-bulk_adata_n.write(par['perturbation_data_n'])
+bulk_adata_n.write(par['perturbation_data'])
3 changes: 3 additions & 0 deletions src/process_data/perturbation/sc_counts/config.vsh.yaml
@@ -17,6 +17,7 @@ functionality:
       required: false
       direction: input
       default: resources/datasets_raw/perturbation_counts.h5ad
+      example: resources_test/datasets_raw/perturbation_counts.h5ad

     - name: --pseudobulked_data
       type: file
@@ -27,6 +28,7 @@
       required: false
       direction: output
       default: resources_local/pseudobulked_data.h5ad
+      example: resources_test/grn-benchmark/perturbation_data.h5ad

     - name: --pseudobulked_data_f
       type: file
@@ -43,6 +45,7 @@
       required: false
       direction: output
       default: resources_local/pseudobulked_data_f.h5ad
+      example: resources_test/grn-benchmark/perturbation_data.h5ad


 resources:
4 changes: 2 additions & 2 deletions src/workflows/process_perturbation/main.nf
@@ -12,11 +12,11 @@ workflow run_wf {

     | normalization.run(
         fromState: [pseudobulked_data_f: "pseudobulked_data_f"],
-        toState: [perturbation_data_n: "perturbation_data_n"]
+        toState: [perturbation_data: "perturbation_data"]
     )

     | batch_correction_scgen.run(
-        fromState: [perturbation_data_n: "perturbation_data_n"],
+        fromState: [perturbation_data: "perturbation_data"],
         toState: [perturbation_data_bc: "perturbation_data_bc"]
     )
