From ba24f2e7fcaa3c947a1b5430fd9aa7a7fd858d42 Mon Sep 17 00:00:00 2001 From: jalil Date: Sun, 11 Aug 2024 10:40:15 +0200 Subject: [PATCH] bug in workflow run grn evaluation fixed --- params/celloracle_test.yaml | 2 +- params/process_perturbation.yaml | 6 + params/subsample_200_ridge.yaml | 273 ++++++++++++++++++ scripts/run_grn_evaluation_tw.sh | 40 ++- scripts/run_grn_inference.sh | 4 +- scripts/run_process_perturbation_tw.sh | 4 +- src/methods/pycistopic/run.sh | 2 +- src/methods/pycistopic/script.py | 17 +- .../run_grn_evaluation/config.vsh.yaml | 1 - src/workflows/run_grn_evaluation/main.nf | 1 + .../run_grn_inference/config.vsh.yaml | 2 +- src/workflows/run_grn_inference/main.nf | 4 +- 12 files changed, 323 insertions(+), 33 deletions(-) create mode 100644 params/process_perturbation.yaml create mode 100644 params/subsample_200_ridge.yaml diff --git a/params/celloracle_test.yaml b/params/celloracle_test.yaml index 75b3ab81b..1ba267c3f 100644 --- a/params/celloracle_test.yaml +++ b/params/celloracle_test.yaml @@ -5,4 +5,4 @@ param_list: num_workers: 20 temp_dir: ./tmp/celloracle output_state: "state.yaml" -publish_dir: "s3://openproblems-data/resources/grn/results/celloracle_test" +publish_dir: "./output/celloracle_test" diff --git a/params/process_perturbation.yaml b/params/process_perturbation.yaml new file mode 100644 index 000000000..684c02990 --- /dev/null +++ b/params/process_perturbation.yaml @@ -0,0 +1,6 @@ +param_list: + - id: test_process_perturatbion + perturbation_counts: s3://openproblems-data/resources/grn/datasets_raw/perturbation_counts.h5ad, + +output_state: "state.yaml" +publish_dir: "s3://openproblems-data/resources/grn/results/process_perturbation" diff --git a/params/subsample_200_ridge.yaml b/params/subsample_200_ridge.yaml new file mode 100644 index 000000000..8c53832f2 --- /dev/null +++ b/params/subsample_200_ridge.yaml @@ -0,0 +1,273 @@ +param_list: + - id: pearson_celloracle + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: pearson + prediction: s3://openproblems-data/resources/grn/grn_models/celloracle.csv + reg_type: ridge + method_id: celloracle + subsample: 200 + max_workers: 20 + + - id: lognorm_celloracle + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/celloracle.csv + reg_type: ridge + method_id: celloracle + subsample: 200 + max_workers: 20 + + - id: scgen_pearson_celloracle + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/celloracle.csv + reg_type: ridge + method_id: celloracle + subsample: 200 + max_workers: 20 + + - id: scgen_lognorm_celloracle + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/celloracle.csv + reg_type: ridge + method_id: celloracle + subsample: 200 + max_workers: 20 + + - id: seurat_pearson_celloracle + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/celloracle.csv + reg_type: ridge + method_id: celloracle + subsample: 200 + max_workers: 20 + + - id: seurat_lognorm_celloracle + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/celloracle.csv + reg_type: ridge + method_id: celloracle + subsample: 200 + max_workers: 20 + + - id: pearson_scenicplus + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: pearson + prediction: s3://openproblems-data/resources/grn/grn_models/scenicplus.csv + reg_type: ridge + method_id: scenicplus + subsample: 200 + max_workers: 20 + + - id: lognorm_scenicplus + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/scenicplus.csv + reg_type: ridge + method_id: scenicplus + subsample: 200 + max_workers: 20 + + - id: scgen_pearson_scenicplus + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/scenicplus.csv + reg_type: ridge + method_id: scenicplus + subsample: 200 + max_workers: 20 + + - id: scgen_lognorm_scenicplus + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/scenicplus.csv + reg_type: ridge + method_id: scenicplus + subsample: 200 + max_workers: 20 + + - id: seurat_pearson_scenicplus + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/scenicplus.csv + reg_type: ridge + method_id: scenicplus + subsample: 200 + max_workers: 20 + + - id: seurat_lognorm_scenicplus + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/scenicplus.csv + reg_type: ridge + method_id: scenicplus + subsample: 200 + max_workers: 20 + + - id: pearson_figr + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: pearson + prediction: s3://openproblems-data/resources/grn/grn_models/figr.csv + reg_type: ridge + method_id: figr + subsample: 200 + max_workers: 20 + + - id: lognorm_figr + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/figr.csv + reg_type: ridge + method_id: figr + subsample: 200 + max_workers: 20 + + - id: scgen_pearson_figr + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/figr.csv + reg_type: ridge + method_id: figr + subsample: 200 + max_workers: 20 + + - id: scgen_lognorm_figr + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/figr.csv + reg_type: ridge + method_id: figr + subsample: 200 + max_workers: 20 + + - id: seurat_pearson_figr + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/figr.csv + reg_type: ridge + method_id: figr + subsample: 200 + max_workers: 20 + + - id: seurat_lognorm_figr + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/figr.csv + reg_type: ridge + method_id: figr + subsample: 200 + max_workers: 20 + + - id: pearson_granie + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: pearson + prediction: s3://openproblems-data/resources/grn/grn_models/granie.csv + reg_type: ridge + method_id: granie + subsample: 200 + max_workers: 20 + + - id: lognorm_granie + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/granie.csv + reg_type: ridge + method_id: granie + subsample: 200 + max_workers: 20 + + - id: scgen_pearson_granie + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/granie.csv + reg_type: ridge + method_id: granie + subsample: 200 + max_workers: 20 + + - id: scgen_lognorm_granie + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/granie.csv + reg_type: ridge + method_id: granie + subsample: 200 + max_workers: 20 + + - id: seurat_pearson_granie + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/granie.csv + reg_type: ridge + method_id: granie + subsample: 200 + max_workers: 20 + + - id: seurat_lognorm_granie + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/granie.csv + reg_type: ridge + method_id: granie + subsample: 200 + max_workers: 20 + + - id: pearson_scglue + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: pearson + prediction: s3://openproblems-data/resources/grn/grn_models/scglue.csv + reg_type: ridge + method_id: scglue + subsample: 200 + max_workers: 20 + + - id: lognorm_scglue + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/scglue.csv + reg_type: ridge + method_id: scglue + subsample: 200 + max_workers: 20 + + - id: scgen_pearson_scglue + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/scglue.csv + reg_type: ridge + method_id: scglue + subsample: 200 + max_workers: 20 + + - id: scgen_lognorm_scglue + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: scgen_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/scglue.csv + reg_type: ridge + method_id: scglue + subsample: 200 + max_workers: 20 + + - id: seurat_pearson_scglue + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_pearson + prediction: s3://openproblems-data/resources/grn/grn_models/scglue.csv + reg_type: ridge + method_id: scglue + subsample: 200 + max_workers: 20 + + - id: seurat_lognorm_scglue + perturbation_data: s3://openproblems-data/resources/grn/grn-benchmark/perturbation_data.h5ad + layer: seurat_lognorm + prediction: s3://openproblems-data/resources/grn/grn_models/scglue.csv + reg_type: ridge + method_id: scglue + subsample: 200 + max_workers: 20 + +output_state: "state.yaml" +publish_dir: "s3://openproblems-data/resources/grn/results/subsample_200_ridge" diff --git a/scripts/run_grn_evaluation_tw.sh b/scripts/run_grn_evaluation_tw.sh index f1e9670aa..e8b7df974 100644 --- a/scripts/run_grn_evaluation_tw.sh +++ b/scripts/run_grn_evaluation_tw.sh @@ -4,14 +4,15 @@ RUN_ID="subsample_200_ridge" -resources_dir="s3://openproblems-data/resources/grn/" +resources_dir="s3://openproblems-data/resources/grn" # resources_dir="resources/" - publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}" reg_type=ridge subsample=200 max_workers=20 +param_file="./params/${RUN_ID}.yaml" + grn_names=( "celloracle" "scenicplus" @@ -22,14 +23,14 @@ grn_names=( layers=("pearson" "lognorm" "scgen_pearson" "scgen_lognorm" "seurat_pearson" "seurat_lognorm") # Start writing to the YAML file -cat > ./params/params_${RUN_ID}.yaml << HERE +cat > $param_file << HERE param_list: HERE # Nested loops to iterate over grn_names and layers for grn_name in "${grn_names[@]}"; do for layer in "${layers[@]}"; do - cat >> ./params/params_${RUN_ID}.yaml << HERE + cat >> $param_file << HERE - id: ${layer}_${grn_name} perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad layer: ${layer} @@ -46,7 +47,7 @@ done # append negative control # grn_name="negative_control" -# cat >> ./params/params_${RUN_ID}.yaml << HERE +# cat >> $param_file << HERE # - id: ${layer}_${grn_name} # perturbation_data: ${perturbation_data} # layer: ${layer} @@ -61,7 +62,7 @@ done # # append the positive controls # grn_name="positive_control" # for layer in "${layers[@]}"; do -# cat >> ./params/params_${RUN_ID}.yaml << HERE +# cat >> $param_file << HERE # - id: ${layer}_${grn_name} # perturbation_data: ${perturbation_data} # layer: ${layer} @@ -75,18 +76,25 @@ done # done # Append the remaining output_state and publish_dir to the YAML file -cat >> ./params/params_${RUN_ID}.yaml << HERE +cat >> $param_file << HERE output_state: "state.yaml" publish_dir: "$publish_dir" HERE -# nextflow run . \ -# -main-script target/nextflow/workflows/run_grn_evaluation/main.nf \ -# -profile docker \ -# -with-trace \ -# -c src/common/nextflow_helpers/labels_ci.config \ -# -params-file ./params/params_${RUN_ID}.yaml - - -# ./tw-windows-x86_64.exe launch https://github.com/openproblems-bio/task_grn_benchmark.git --revision build/main --pull-latest --main-script target/nextflow/workflows/run_grn_evaluation/main.nf --workspace 53907369739130 --compute-env 6TeIFgV5OY4pJCk8I0bfOh --params-file ./params/params.yaml --config src/common/nextflow_helpers/labels_tw.config \ No newline at end of file +nextflow run . \ + -main-script target/nextflow/workflows/run_grn_evaluation/main.nf \ + -profile docker \ + -with-trace \ + -c src/common/nextflow_helpers/labels_ci.config \ + -params-file ${param_file} + +# ./tw-windows-x86_64.exe launch ` +# https://github.com/openproblems-bio/task_grn_benchmark.git ` +# --revision build/main ` +# --pull-latest ` +# --main-script target/nextflow/workflows/run_grn_evaluation/main.nf ` +# --workspace 53907369739130 ` +# --compute-env 6TeIFgV5OY4pJCk8I0bfOh ` +# --params-file ./params/subsample_200_ridge.yaml ` +# --config src/common/nextflow_helpers/labels_tw.config diff --git a/scripts/run_grn_inference.sh b/scripts/run_grn_inference.sh index 9896656cf..e2ed85674 100644 --- a/scripts/run_grn_inference.sh +++ b/scripts/run_grn_inference.sh @@ -5,8 +5,10 @@ RUN_ID="celloracle_test" # resources_dir="s3://openproblems-data/resources_test/grn/" +# publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}" + resources_dir="./resources_test" -publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}" +publish_dir="./output/${RUN_ID}" num_workers=20 diff --git a/scripts/run_process_perturbation_tw.sh b/scripts/run_process_perturbation_tw.sh index 1ec728b83..51c0e4815 100644 --- a/scripts/run_process_perturbation_tw.sh +++ b/scripts/run_process_perturbation_tw.sh @@ -7,7 +7,7 @@ publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}" cat > ./params/${RUN_ID}.yaml << HERE param_list: - id: test_process_perturatbion - perturbation_counts: "$resources_dir/datasets_raw/perturbation_counts.h5ad", + perturbation_counts: $resources_dir/datasets_raw/perturbation_counts.h5ad, output_state: "state.yaml" publish_dir: "$publish_dir" @@ -24,4 +24,4 @@ HERE -# ./tw-windows-x86_64.exe launch openproblems-bio/task_grn_benchmark --revision build/main --pull-latest --main-script target/nextflow/workflows/process_perturbation/main.nf --workspace 53907369739130 --compute-env 6TeIFgV5OY4pJCk8I0bfOh --params-file /tmp/params.yaml --config src/common/nextflow_helpers/labels_tw.config + ./tw-windows-x86_64.exe launch https://github.com/openproblems-bio/task_grn_benchmark.git --revision build/main --pull-latest --main-script target/nextflow/workflows/process_perturbation/main.nf --workspace 53907369739130 --compute-env 6TeIFgV5OY4pJCk8I0bfOh --params-file ./params/process_perturbation.yaml --config src/common/nextflow_helpers/labels_tw.config \ No newline at end of file diff --git a/src/methods/pycistopic/run.sh b/src/methods/pycistopic/run.sh index d80a2aec8..60720b71b 100644 --- a/src/methods/pycistopic/run.sh +++ b/src/methods/pycistopic/run.sh @@ -1,3 +1,3 @@ viash run src/methods/pycistopic/config.vsh.yaml -- --multiomics_atac resources_test/grn-benchmark/multiomics_atac.h5ad \ - --temp_dir output/pycistopic \ No newline at end of file + --temp_dir output/pycistopic --num_workers 10 \ No newline at end of file diff --git a/src/methods/pycistopic/script.py b/src/methods/pycistopic/script.py index 07870d15f..c116b5604 100644 --- a/src/methods/pycistopic/script.py +++ b/src/methods/pycistopic/script.py @@ -34,6 +34,7 @@ par = { 'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad', 'temp_dir': 'output/pycistopic', + 'num_workers': 10, 'qc': False } ## VIASH END @@ -151,7 +152,7 @@ bed_path=os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bed_files'), bigwig_path=os.path.join(out_dir, 'consensus_peak_calling/pseudobulk_bw_files'), path_to_fragments=fragments_dict, - n_cpu=10, + n_cpu=num_workers, temp_dir=os.path.join(out_dir, 'consensus_peak_calling/tmp'), split_pattern='-', ) @@ -172,7 +173,7 @@ bed_paths=bed_paths, outdir=os.path.join(os.path.join(out_dir, 'consensus_peak_calling/MACS')), genome_size='hs', - n_cpu=10, + n_cpu=num_workers, input_format='BEDPE', shift=73, ext_size=146, @@ -267,7 +268,7 @@ path_to_blacklist=os.path.join(out_dir, 'hg38-blacklist.v2.bed'), metrics=sample_metrics, valid_bc=sample_id_to_barcodes_passing_filters[sample_id], - n_cpu=10, + n_cpu=num_workers, project=donor_id, split_pattern='-' ) @@ -282,7 +283,7 @@ path_to_fragments=fragments_dict[donor_id], path_to_regions=os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed'), path_to_blacklist=os.path.join(out_dir, 'hg38-blacklist.v2.bed'), - n_cpu=10, + n_cpu=num_workers, project=donor_id, split_pattern='-' ) @@ -340,7 +341,7 @@ models = run_cgs_models_mallet( cistopic_obj_list[i], n_topics=n_topics, - n_cpu=12, + n_cpu=num_workers, n_iter=500, random_state=555, alpha=50, @@ -354,7 +355,7 @@ models = run_cgs_models( cistopic_obj_list[i], n_topics=n_topics, - n_cpu=12, + n_cpu=num_workers, n_iter=500, random_state=555, alpha=50, @@ -435,7 +436,7 @@ contrasts=None, adjpval_thr=0.05, log2fc_thr=np.log2(1.5), - n_cpu=5, + n_cpu=num_workers, split_pattern='-' ) @@ -527,7 +528,7 @@ contrasts=None, adjpval_thr=0.05, log2fc_thr=np.log2(1.5), - n_cpu=5, + n_cpu=num_workers, split_pattern='-' ) diff --git a/src/workflows/run_grn_evaluation/config.vsh.yaml b/src/workflows/run_grn_evaluation/config.vsh.yaml index 782446de5..7cac49cc4 100644 --- a/src/workflows/run_grn_evaluation/config.vsh.yaml +++ b/src/workflows/run_grn_evaluation/config.vsh.yaml @@ -11,7 +11,6 @@ functionality: arguments: - name: --perturbation_data type: file - must_exist: True direction: input - name: --layer type: string diff --git a/src/workflows/run_grn_evaluation/main.nf b/src/workflows/run_grn_evaluation/main.nf index e2c2ca5e8..3bb5dc4ab 100644 --- a/src/workflows/run_grn_evaluation/main.nf +++ b/src/workflows/run_grn_evaluation/main.nf @@ -32,6 +32,7 @@ workflow run_wf { }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ + perturbation_data: "perturbation_data", layer: "layer", prediction: "prediction", subsample: "subsample", diff --git a/src/workflows/run_grn_inference/config.vsh.yaml b/src/workflows/run_grn_inference/config.vsh.yaml index b3d4fd08c..d4c86589e 100644 --- a/src/workflows/run_grn_inference/config.vsh.yaml +++ b/src/workflows/run_grn_inference/config.vsh.yaml @@ -32,7 +32,7 @@ functionality: - name: Outputs arguments: - - name: --prediction_celloracle + - name: --prediction __merge__: ../../api/file_prediction.yaml required: true direction: output diff --git a/src/workflows/run_grn_inference/main.nf b/src/workflows/run_grn_inference/main.nf index 86e53b758..5d36bff25 100644 --- a/src/workflows/run_grn_inference/main.nf +++ b/src/workflows/run_grn_inference/main.nf @@ -11,9 +11,9 @@ workflow run_wf { temp_dir: "temp_dir", num_workers: "num_workers" ], - toState: [prediction:"prediction_celloracle"] + toState: [prediction:"prediction"] ) - | setState(["prediction_celloracle"]) + | setState(["prediction"]) emit: output_ch