Commit f7b438a

run evaluation script is added

janursa committed Aug 8, 2024
1 parent c663e06 commit f7b438a
Showing 12 changed files with 199 additions and 56 deletions.
2 changes: 1 addition & 1 deletion params/params.yaml
@@ -3,4 +3,4 @@ param_list:
   perturbation_counts: "s3://openproblems-data/resources/grn/datasets_raw/perturbation_counts.h5ad"

 output_state: "state.yaml"
-publish_dir: "s3://openproblems-data/resources/grn/results/process_perturbation/run_2"
+publish_dir: "s3://openproblems-data/resources/grn/results/process_perturbation/run_4"
45 changes: 23 additions & 22 deletions scripts/run_evaluation.sh
@@ -1,47 +1,48 @@
 #!/bin/bash

 # Default values
-grn=""
-sample="200" # Default value for sample
+grn_name="test"
+subsample="200" # Default value for sample
 reg_type="ridge"
 score="out/score.csv"
+layer="pearson"
+max_workers="4"

 # Parse arguments
 while [[ "$#" -gt 0 ]]; do
     case $1 in
-        --grn) grn="$2"; shift ;;
-        --sample) sample="$2"; shift ;;
+        --grn_model) grn_model="$2"; shift ;;
+        --grn_name) grn_name="$2"; shift ;;
+        --subsample) subsample="$2"; shift ;;
         --reg_type) reg_type="$2"; shift ;;
         --score) score="$2"; shift ;;
+        --layer) layer="$2"; shift ;;
+        --max_workers) max_workers="$2"; shift ;;
         *) echo "Unknown parameter passed: $1"; exit 1 ;;
     esac
     shift
 done

 # Ensure required arguments are provided
-if [ -z "$grn" ]; then
-    echo "Usage: $0 --grn <grn_file> [--sample <sample_value>]"
+if [ -z "$grn_model" ]; then
+    echo "Usage: $0 --grn_model <grn_file> [--subsample <sample_value>]"
     exit 1
 fi

 # Print parsed arguments (for debugging purposes)
-echo "GRN file: $grn"
-echo "Sample value: $sample"
-echo "Regression model: $reg_type"
+echo "GRN name: $grn_name Regression model: $reg_type Samples: $subsample layer: $layer max_workers: $max_workers"

 # Clean bin/ folder
 rm -r bin
 mkdir bin

-# Run regression analysis 1
-echo "Running GRN benchmark with $grn and sample size $sample"
+# # Run regression analysis 1
+echo "Regression 1"
-mkdir -p bin/regression_1
-viash build src/metrics/regression_1/config.vsh.yaml -p docker -o bin/regression_1
-bin/regression_1/regression_1 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
+score_reg1="${score/.csv/_reg1.csv}"
+viash run src/metrics/regression_1/config.vsh.yaml -- --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
+    --reg_type $reg_type --prediction $grn_model --score $score_reg1 --layer $layer --subsample $subsample --max_workers $max_workers

-# Run regression analysis 2
+# # Run regression analysis 2
+echo "Regression 2"
-mkdir -p bin/regression_2
-viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2
-bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
+score_reg2="${score/.csv/_reg2.csv}"
+time viash run src/metrics/regression_1/config.vsh.yaml -- --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
+    --reg_type $reg_type --prediction $grn_model --score $score_reg2 --layer $layer --subsample $subsample --max_workers $max_workers

+paste -d ',' "$score_reg1" <(cut -d ',' -f2- "$score_reg2") > $score
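For orientation, the reworked script would be invoked roughly as follows; this is a hypothetical call, with the model path and output path borrowed from defaults used elsewhere in this commit:

# Hypothetical invocation; file paths are examples only.
bash scripts/run_evaluation.sh \
    --grn_model resources/grn-benchmark/grn_models/celloracle.csv \
    --grn_name celloracle \
    --reg_type ridge \
    --subsample 200 \
    --layer pearson \
    --max_workers 4 \
    --score out/celloracle.csv

The closing paste line merges the two metric files column-wise: cut -d ',' -f2- strips the index column from the regression-2 scores, and paste -d ',' appends the remaining columns to the regression-1 rows, yielding one combined score CSV.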
49 changes: 45 additions & 4 deletions scripts/run_evaluation_all.sh
@@ -1,15 +1,56 @@
 #!/bin/bash

 models_folder="resources/grn-benchmark/grn_models"
-grn_models=(
+grn_names=(
     "celloracle"
     "scenicplus"
     "figr"
     "granie"
     "scglue"
+    "positive_control"
+    "negative_control"
 )
+reg_type="ridge"
+subsample="2"
+max_workers="4"
+folder="out"

-# Loop through each GRN model and run the benchmark script
-for grn_model in "${grn_models[@]}"; do
-    bash scripts/run_evaluation_grn.sh --grn "$models_folder/$grn_model.csv" --sample 200 --reg_type ridge --score "out/$grn_model.csv"

+# layer="pearson"
+layers=("pearson" "lognorm" "scgen_pearson" "scgen_lognorm" "seurat_pearson" "seurat_lognorm")
+
+for layer in "${layers[@]}"; do
+    # Loop through each GRN model and run the benchmark script
+    layer_folder="${folder}/${layer}"
+    mkdir $layer_folder
+    for grn_name in "${grn_names[@]}"; do
+        grn_model="${models_folder}/${grn_name}.csv"
+        score="${layer_folder}/${grn_name}.csv"
+        bash scripts/run_evaluation.sh --grn_model "$grn_model" \
+            --grn_name "$grn_name" --subsample "$subsample" --reg_type "$reg_type" \
+            --score "$score" --max_workers "$max_workers" --layer "$layer"
+    done
 done

+# aggregate the scores across layers (axis=0)
+for grn_name in "${grn_names[@]}"; do
+    # Define a file to store the aggregated scores
+    aggregated_score="${folder}/${grn_name}.csv"
+
+    # Remove the file if it exists (to avoid appending to old data)
+    rm -f "$aggregated_score"
+
+    for layer in "${layers[@]}"; do
+        # Define the path to the score file for the current layer
+        layer_folder="${folder}/${layer}"
+        score="${layer_folder}/${grn_name}.csv"
+
+        # Append the score to the aggregated file
+        # Skip the header for all but the first file
+        if [ -s "$aggregated_score" ]; then
+            tail -n +2 "$score" >> "$aggregated_score"
+        else
+            cat "$score" > "$aggregated_score"
+        fi
+    done
+done
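Assuming the defaults above and a successful run, the aggregation step leaves a layout along these lines (illustrative sketch, one model shown):

out/
  pearson/celloracle.csv        # per-layer scores, one header row each
  lognorm/celloracle.csv
  ...                           # remaining four layers
  celloracle.csv                # aggregated: six layer rows under a single header

The tail -n +2 / cat branch keeps the header only from the first per-layer file, so the aggregated CSV stacks the layer rows (axis=0) without repeated headers.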
29 changes: 12 additions & 17 deletions src/metrics/regression_1/main.py
@@ -8,6 +8,8 @@
 from concurrent.futures import ThreadPoolExecutor
 import pandas as pd
 import anndata as ad
+from tqdm import tqdm
+
 import os

@@ -50,7 +52,7 @@ def cv_5(genes_n):
     return groups


-def run_method_1(
+def regression_1(
     net: pd.DataFrame,
     train_df: pd.DataFrame,
     reg_type: str = 'GB',
@@ -85,7 +87,6 @@ def run_method_1(
     Y_df = train_df.loc[included_genes,:]

     mask_shared_genes = X_df.index.isin(net.index)
-    # print(X_df.shape, Y_df.shape)

     # fill the actuall regulatory links
     X_df.loc[mask_shared_genes, :] = net.values
@@ -99,38 +100,28 @@ def run_method_1(
     y_pred[:] = means
     y_pred = y_pred.values

     # initialize y_true
     Y = Y_df.values
     y_true = Y.copy()

     unique_groups = np.unique(groups)
-    for group in unique_groups:
+    for group in tqdm(unique_groups, desc="Processing groups"):
         mask_va = groups == group
         mask_tr = ~mask_va

         # Use logical AND to combine masks correctly
         X_tr = X[mask_tr & mask_shared_genes, :]
         Y_tr = Y[mask_tr & mask_shared_genes, :]

         regr.fit(X_tr, Y_tr)

         y_pred[mask_va & mask_shared_genes, :] = regr.predict(X[mask_va & mask_shared_genes, :])

-    # assert ~(y_true==0).any()
-
-    # if verbose >= 1:
-    score_r2 = r2_score(y_true, y_pred, multioutput='variance_weighted') #uniform_average', 'variance_weighted
-    # print(f'score_r2: ', score_r2)
-
+    mean_score_r2 = r2_score(y_true, y_pred, multioutput='variance_weighted')
     # gene_scores_r2 = r2_score(y_true.T, y_pred.T, multioutput='raw_values')

     output = dict(mean_score_r2=mean_score_r2)

     return output

 def set_global_seed(seed):
@@ -198,6 +189,8 @@ def main(par):
     layer_results = {}  # Store results for this layer
     for exclude_missing_genes in [True, False]:  # two settings on target gene
         for tf_n in [-1, 140]:  # two settings on tfs
+            run_key = f'ex({exclude_missing_genes})_tf({tf_n})'
+            print(run_key)
             net_subset = net_processed.copy()

             # Subset TFs
@@ -211,9 +204,9 @@ def main(par):
             degrees = net_subset.abs().sum(axis=0)
             net_subset = net_subset[degrees.nlargest(tf_n).index]

-            output = run_method_1(net_subset, pert_df, exclude_missing_genes=exclude_missing_genes, reg_type=reg_type, max_workers=max_workers)
-            result_key = f'ex({exclude_missing_genes})_tf({tf_n})'
-            layer_results[result_key] = [output['mean_score_r2']]
+            output = regression_1(net_subset, pert_df, exclude_missing_genes=exclude_missing_genes, reg_type=reg_type, max_workers=max_workers)
+
+            layer_results[run_key] = [output['mean_score_r2']]

     # Convert results to DataFrame
     df_results = pd.DataFrame(layer_results)
@@ -222,4 +215,6 @@ def main(par):
     if 'ex(False)_tf(140)' not in df_results.columns:
         df_results['ex(False)_tf(140)'] = df_results['ex(False)_tf(-1)']

+    df_results['Mean'] = df_results.mean(axis=1)
+
     return df_results
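For readers skimming the diff, the renamed regression_1 routine amounts to leave-one-group-out cross-validation with a multi-output regressor, scored by variance-weighted R². A minimal sketch, assuming dense arrays and plain ridge regression (the gene masking, subsampling, and parallelism in main.py are omitted; cv_score is a hypothetical helper name):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

def cv_score(X, Y, groups):
    # Hold out one group at a time, train on the rest, predict the held-out rows.
    y_pred = np.zeros_like(Y, dtype=float)
    regr = Ridge()
    for group in np.unique(groups):
        mask_va = groups == group   # validation rows
        mask_tr = ~mask_va          # training rows
        regr.fit(X[mask_tr, :], Y[mask_tr, :])
        y_pred[mask_va, :] = regr.predict(X[mask_va, :])
    # Variance-weighted R2 across genes, as in the diff above.
    return r2_score(Y, y_pred, multioutput='variance_weighted')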
103 changes: 103 additions & 0 deletions src/metrics/regression_1/plot_scores.ipynb

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions src/metrics/regression_1/script.py
@@ -15,15 +15,13 @@
 }

 ## VIASH END
-print('Reading input data')

 sys.path.append(meta["resources_dir"])
 from main import main

 output = main(par)
-print(output)
-# output.columns = ['S1', 'S2', 'S3', 'S4']

+output.index=[par["layer"]]
 print("Write output to file", flush=True)
 print(output)
 output.to_csv(par['score'])
@@ -8,7 +8,7 @@ functionality:
   summary: "Correct batch effects using scgen"

   arguments:
-    - name: --perturbation_data_n
+    - name: --perturbation_data
       type: file
       info:
         label: perturbation
@@ -6,13 +6,13 @@

 ## VIASH START
 par = {
-    'perturbation_data_n': 'resources/grn-benchmark/perturbation_data.h5ad',
+    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
     "perturbation_data_bc": 'resources/grn-benchmark/perturbation_data.h5ad'
 }
 ## VIASH END
 batch_key = 'plate_name'
 label_key = 'cell_type'
-bulk_adata = ad.read_h5ad(par['perturbation_data_n'])
+bulk_adata = ad.read_h5ad(par['perturbation_data'])
 print(bulk_adata)

 for norm_name in ['lognorm', 'pearson']:
6 changes: 4 additions & 2 deletions src/process_data/perturbation/normalization/config.vsh.yaml
@@ -10,7 +10,7 @@ functionality:

   arguments:

-    - name: --perturbation_data_f
+    - name: --pseudobulked_data_f
       type: file
       info:
         label: perturbation
@@ -26,8 +26,9 @@
       required: false
       direction: input
       default: resources_local/pseudobulked_data_f
+      example: resources_test/grn-benchmark/perturbation_data.h5ad

-    - name: --perturbation_data_n
+    - name: --perturbation_data
       type: file
       info:
         label: perturbation
@@ -50,6 +51,7 @@
       required: false
       direction: input
       default: resources/grn-benchmark/perturbation_data.h5ad
+      example: resources_test/grn-benchmark/perturbation_data.h5ad


 resources:
4 changes: 2 additions & 2 deletions src/process_data/perturbation/normalization/script.py
@@ -11,7 +11,7 @@

 ## VIASH START
 par = {
     'pseudobulked_data_f': 'resources/grn-benchmark/perturbation_data.h5ad',
-    'perturbation_data_n': 'resources/grn-benchmark/perturbation_data.h5ad'
+    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad'
 }
 ## VIASH END

@@ -35,4 +35,4 @@ def normalize_func(bulk_adata):
 bulk_adata_n = normalize_func(bulk_adata_filtered)
 print("Normalizing completed")
 print("Writing the file")
-bulk_adata_n.write(par['perturbation_data_n'])
+bulk_adata_n.write(par['perturbation_data'])
3 changes: 3 additions & 0 deletions src/process_data/perturbation/sc_counts/config.vsh.yaml
@@ -17,6 +17,7 @@ functionality:
       required: false
       direction: input
       default: resources/datasets_raw/perturbation_counts.h5ad
+      example: resources_test/datasets_raw/perturbation_counts.h5ad

     - name: --pseudobulked_data
       type: file
@@ -27,6 +28,7 @@
       required: false
       direction: output
       default: resources_local/pseudobulked_data.h5ad
+      example: resources_test/grn-benchmark/perturbation_data.h5ad

     - name: --pseudobulked_data_f
       type: file
@@ -43,6 +45,7 @@
       required: false
       direction: output
       default: resources_local/pseudobulked_data_f.h5ad
+      example: resources_test/grn-benchmark/perturbation_data.h5ad


 resources:
4 changes: 2 additions & 2 deletions src/workflows/process_perturbation/main.nf
@@ -12,11 +12,11 @@ workflow run_wf {

     | normalization.run(
         fromState: [pseudobulked_data_f: "pseudobulked_data_f"],
-        toState: [perturbation_data_n: "perturbation_data_n"]
+        toState: [perturbation_data: "perturbation_data"]
     )

     | batch_correction_scgen.run(
-        fromState: [perturbation_data_n: "perturbation_data_n"],
+        fromState: [perturbation_data: "perturbation_data"],
         toState: [perturbation_data_bc: "perturbation_data_bc"]
     )
