From bfc18adec47eaae225fd670ade192f8a5feea0f5 Mon Sep 17 00:00:00 2001 From: jalil Date: Sat, 5 Oct 2024 15:37:10 +0200 Subject: [PATCH] process multiomics workflow updated --- scripts/{ => repo}/extract_resources.sh | 0 scripts/{ => repo}/run_pc_vs_nc.sh | 0 scripts/{ => repo}/run_robust_analys.sh | 0 .../{ => repo}/run_robust_analys_causal.sh | 0 scripts/run_process_multiomics_dataset.sh | 29 ++++++++++++ scripts/run_process_perturbation_dataset.sh | 28 ++++++++++++ scripts/run_process_perturbation_tw.sh | 28 ------------ src/methods/multi_omics/figr/script.R | 1 + .../multiomics/batch_correction/script.py | 40 ----------------- .../multiomics/format_data/config.vsh.yaml | 15 +------ .../multiomics/format_data/script.py | 27 ++++------- .../format_resources_r/config.vsh.yaml | 33 ++++++-------- .../multiomics/format_resources_r/script.R | 1 + .../multiome_matrix/config.vsh.yaml | 21 +++++---- .../multiomics/subset_hvg/config.vsh.yaml | 45 +++++++++++++++++++ .../multiomics/subset_hvg/script.py | 29 ++++++++++++ .../process_multiomics/config.vsh.yaml | 41 +++++++++++++---- src/workflows/process_multiomics/main.nf | 28 +++++++++++- 18 files changed, 227 insertions(+), 139 deletions(-) rename scripts/{ => repo}/extract_resources.sh (100%) rename scripts/{ => repo}/run_pc_vs_nc.sh (100%) rename scripts/{ => repo}/run_robust_analys.sh (100%) rename scripts/{ => repo}/run_robust_analys_causal.sh (100%) create mode 100644 scripts/run_process_multiomics_dataset.sh create mode 100644 scripts/run_process_perturbation_dataset.sh delete mode 100644 scripts/run_process_perturbation_tw.sh delete mode 100644 src/process_data/multiomics/batch_correction/script.py create mode 100644 src/process_data/multiomics/subset_hvg/config.vsh.yaml create mode 100644 src/process_data/multiomics/subset_hvg/script.py diff --git a/scripts/extract_resources.sh b/scripts/repo/extract_resources.sh similarity index 100% rename from scripts/extract_resources.sh rename to scripts/repo/extract_resources.sh diff --git a/scripts/run_pc_vs_nc.sh b/scripts/repo/run_pc_vs_nc.sh similarity index 100% rename from scripts/run_pc_vs_nc.sh rename to scripts/repo/run_pc_vs_nc.sh diff --git a/scripts/run_robust_analys.sh b/scripts/repo/run_robust_analys.sh similarity index 100% rename from scripts/run_robust_analys.sh rename to scripts/repo/run_robust_analys.sh diff --git a/scripts/run_robust_analys_causal.sh b/scripts/repo/run_robust_analys_causal.sh similarity index 100% rename from scripts/run_robust_analys_causal.sh rename to scripts/repo/run_robust_analys_causal.sh diff --git a/scripts/run_process_multiomics_dataset.sh b/scripts/run_process_multiomics_dataset.sh new file mode 100644 index 000000000..d757dd309 --- /dev/null +++ b/scripts/run_process_multiomics_dataset.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +RUN_ID="process_multiomics" +# resources_dir="s3://openproblems-data/resources/grn/" +resources_dir="resources" +publish_dir="${resources_dir}/results/${RUN_ID}" + +cat > ./params/${RUN_ID}.yaml << HERE +param_list: + - id: process_multiomics + multiome_counts: $resources_dir/datasets_raw/multiome_counts.h5ad + +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + + +# ./tw-windows-x86_64.exe launch https://github.com/openproblems-bio/task_grn_inference.git ` +# --revision build/main --pull-latest ` +# --main-script target/nextflow/workflows/process_multiomics/main.nf ` +# --workspace 53907369739130 --compute-env 6TeIFgV5OY4pJCk8I0bfOh ` +# --params-file ./params/process_multiomics.yaml ` +# --config src/common/nextflow_helpers/labels_tw.config + + +nextflow run . \ + -main-script target/nextflow/workflows/process_multiomics/main.nf \ + -profile docker -with-trace -c src/common/nextflow_helpers/labels_ci.config \ + -params-file params/${RUN_ID}.yaml diff --git a/scripts/run_process_perturbation_dataset.sh b/scripts/run_process_perturbation_dataset.sh new file mode 100644 index 000000000..00a358bfb --- /dev/null +++ b/scripts/run_process_perturbation_dataset.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +RUN_ID="process_perturbation" +resources_dir="s3://openproblems-data/resources/grn/" +publish_dir="${resources_dir}/results/${RUN_ID}" + +cat > ./params/${RUN_ID}.yaml << HERE +param_list: + - id: test_process_perturatbion + perturbation_counts: $resources_dir/datasets_raw/perturbation_counts.h5ad + +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + + +# ./tw-windows-x86_64.exe launch https://github.com/openproblems-bio/task_grn_inference.git ` +# --revision build/main --pull-latest ` +# --main-script target/nextflow/workflows/process_perturbation/main.nf ` +# --workspace 53907369739130 --compute-env 6TeIFgV5OY4pJCk8I0bfOh ` +# --params-file ./params/process_perturbation.yaml ` +# --config src/common/nextflow_helpers/labels_tw.config + + +nextflow run . \ + -main-script target/nextflow/workflows/process_perturbation/main.nf \ + -profile docker -with-trace -c src/common/nextflow_helpers/labels_ci.config \ + -params-file params/${RUN_ID}.yaml diff --git a/scripts/run_process_perturbation_tw.sh b/scripts/run_process_perturbation_tw.sh deleted file mode 100644 index 5f92d1631..000000000 --- a/scripts/run_process_perturbation_tw.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -RUN_ID="process_perturbation" -resources_dir="s3://openproblems-data/resources/grn/" -publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}" - -cat > ./params/${RUN_ID}.yaml << HERE -param_list: - - id: test_process_perturatbion - perturbation_counts: $resources_dir/datasets_raw/perturbation_counts.h5ad - -output_state: "state.yaml" -publish_dir: "$publish_dir" -HERE - - - ./tw-windows-x86_64.exe launch https://github.com/openproblems-bio/task_grn_inference.git ` - --revision build/main --pull-latest ` - --main-script target/nextflow/workflows/process_perturbation/main.nf ` - --workspace 53907369739130 --compute-env 6TeIFgV5OY4pJCk8I0bfOh ` - --params-file ./params/process_perturbation.yaml ` - --config src/common/nextflow_helpers/labels_tw.config - - -nextflow run . \ - -main-script target/nextflow/workflows/grn_inference_granie/main.nf \ - -profile docker -with-trace -c src/common/nextflow_helpers/labels_ci.config \ - -params-file params/granie.yaml diff --git a/src/methods/multi_omics/figr/script.R b/src/methods/multi_omics/figr/script.R index 3d7a75d2c..066848090 100644 --- a/src/methods/multi_omics/figr/script.R +++ b/src/methods/multi_omics/figr/script.R @@ -27,6 +27,7 @@ dir.create(par$temp_dir, recursive = TRUE, showWarnings = TRUE) atac = readRDS(par$multiomics_atac_r) rna = readRDS(par$multiomics_rna_r) + colnames(atac) <- gsub("-", "", colnames(atac)) colnames(rna) <- gsub("-", "", colnames(rna)) diff --git a/src/process_data/multiomics/batch_correction/script.py b/src/process_data/multiomics/batch_correction/script.py deleted file mode 100644 index 89bb7932f..000000000 --- a/src/process_data/multiomics/batch_correction/script.py +++ /dev/null @@ -1,40 +0,0 @@ -import anndata as ad -import pandas as pd -import numpy as np -import scanpy as sc -import scgen -import torch -print(torch.cuda.is_available()) - -## VIASH START -par = { - 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_qc.h5ad', - "multiomics_rna_bc": 'resources/grn-benchmark/multiomics_rna_qc_bc.h5ad' -} -## VIASH END -batch_key = 'donor_id' -label_key = 'cell_type' -bulk_adata = ad.read_h5ad(par['multiomics_rna']) - -# normalize -sc.experimental.pp.normalize_pearson_residuals(bulk_adata) - -# scgen pipeline -train = bulk_adata.copy() -sc.pp.neighbors(train) -sc.tl.umap(train) - -scgen.SCGEN.setup_anndata(train, batch_key=batch_key, labels_key=label_key) -model = scgen.SCGEN(train) -model.train( - max_epochs=100, - batch_size=64, - early_stopping=True, - early_stopping_patience=25 -) - -corrected_adata = model.batch_removal() - -bulk_adata.X = corrected_adata.X -print(f"Saving the file") -bulk_adata.write_h5ad(par['multiomics_rna_bc']) \ No newline at end of file diff --git a/src/process_data/multiomics/format_data/config.vsh.yaml b/src/process_data/multiomics/format_data/config.vsh.yaml index 220dfe537..b423e5eb8 100644 --- a/src/process_data/multiomics/format_data/config.vsh.yaml +++ b/src/process_data/multiomics/format_data/config.vsh.yaml @@ -8,26 +8,15 @@ functionality: arguments: - name: --multiome_counts type: file - required: false + required: true direction: input example: resources/datasets_raw/multiome_counts.h5ad + - name: --multiomics_rna type: file required: false direction: output example: resources/grn-benchmark/multiomics_rna.h5ad - - name: --multiomics_rna_d0 - type: file - required: false - direction: output - example: resources/grn-benchmark/multiomics_rna_d0.h5ad - - - name: --multiomics_rna_d0_hvg - type: file - required: false - direction: output - example: resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad - - name: --multiomics_atac type: file required: false diff --git a/src/process_data/multiomics/format_data/script.py b/src/process_data/multiomics/format_data/script.py index de8a2af35..7184eca99 100644 --- a/src/process_data/multiomics/format_data/script.py +++ b/src/process_data/multiomics/format_data/script.py @@ -1,5 +1,6 @@ import anndata as ad import scanpy as sc +import numpy as np ## VIASH START par = { # 'multiome_counts': 'resources/datasets_raw/multiome_counts.h5ad', @@ -26,22 +27,14 @@ multiomics_rna = multiomics[:,multiomics.var.feature_types=='Gene Expression'] multiomics_rna.var = multiomics_rna.var[['gene_ids', 'interval']] -def high_coverage(adata): - threshold = 0.1 - mask = adata.X!=0 - mask_obs = (np.sum(mask, axis=1).A.flatten()/mask.shape[1])>threshold - mask_var = (np.sum(mask, axis=0).A.flatten()/mask.shape[0])>threshold - adata.obs['high_coverage'] = mask_obs - adata.var['high_coverage'] = mask_var -high_coverage(multiomics_rna) - -# hvgs -var = sc.pp.highly_variable_genes(multiomics_rna, flavor='seurat_v3', n_top_genes=7000, inplace=False) -multiomics_rna.var['highly_variable'] = var.highly_variable - -# subset to donor 0 -multiomics_rna_d0 = multiomics_rna[multiomics_rna.obs.donor_id=='donor_0', :] -multiomics_rna_d0_hvg = multiomics_rna[multiomics_rna.obs.donor_id=='donor_0', multiomics_rna.var.highly_variable] +# def high_coverage(adata): +# threshold = 0.1 +# mask = adata.X!=0 +# mask_obs = (np.sum(mask, axis=1).A.flatten()/mask.shape[1])>threshold +# mask_var = (np.sum(mask, axis=0).A.flatten()/mask.shape[0])>threshold +# adata.obs['high_coverage'] = mask_obs +# adata.var['high_coverage'] = mask_var +# high_coverage(multiomics_rna) #------ ATAC multiomics_atac = multiomics[:,multiomics.var.feature_types=='Peaks'] multiomics_atac.var = multiomics_atac.var[[]] @@ -62,6 +55,4 @@ def high_coverage(adata): multiomics_atac.obs['donor_id'] = multiomics_atac.obs['donor_id'].map(donor_map) multiomics_rna.write(par['multiomics_rna']) -multiomics_rna_h0.write(par['multiomics_rna_h0']) -multiomics_rna_h0_hvg.write(par['multiomics_rna_h0_hvg']) multiomics_atac.write(par['multiomics_atac']) \ No newline at end of file diff --git a/src/process_data/multiomics/format_resources_r/config.vsh.yaml b/src/process_data/multiomics/format_resources_r/config.vsh.yaml index 38b303e3c..8b8e50dc1 100644 --- a/src/process_data/multiomics/format_resources_r/config.vsh.yaml +++ b/src/process_data/multiomics/format_resources_r/config.vsh.yaml @@ -8,50 +8,45 @@ functionality: arguments: - name: --rna_matrix type: file - required: false + required: true direction: input - default: output/scRNA/X_matrix.mtx - + example: output/scRNA/X_matrix.mtx - name: --atac_matrix type: file - required: false + required: true direction: input - default: output/scATAC/X_matrix.mtx - + example: output/scATAC/X_matrix.mtx - name: --rna_gene_annot type: file - required: false + required: true direction: input - default: output/scRNA/annotation_gene.csv - + example: output/scRNA/annotation_gene.csv - name: --rna_cell_annot type: file - required: false + required: true direction: input - default: output/scRNA/annotation_cell.csv - + example: output/scRNA/annotation_cell.csv - name: --atac_peak_annot type: file - required: false + required: true direction: input - default: output/scATAC/annotation_gene.csv - + example: output/scATAC/annotation_gene.csv - name: --atac_cell_annot type: file - required: false + required: true direction: input - default: output/scATAC/annotation_cell.csv + example: output/scATAC/annotation_cell.csv - name: --rna_rds type: file required: false direction: output - default: resources/grn-benchmark/multiomics_r/rna.rds + example: resources/grn-benchmark/multiomics_r/rna.rds - name: --atac_rds type: file required: false direction: output - default: resources/grn-benchmark/multiomics_r/atac.rds + example: resources/grn-benchmark/multiomics_r/atac.rds diff --git a/src/process_data/multiomics/format_resources_r/script.R b/src/process_data/multiomics/format_resources_r/script.R index 62e98966f..997017f2b 100644 --- a/src/process_data/multiomics/format_resources_r/script.R +++ b/src/process_data/multiomics/format_resources_r/script.R @@ -32,6 +32,7 @@ annotation_peak_filtered <- annotation_peak[filter_indices, ] # Filter the rows in X X_filtered <- X[filter_indices, ] + # Create the SummarizedExperiment object with the filtered data atac <- SummarizedExperiment(assays = list(counts = X_filtered), rowRanges = GRanges(annotation_peak_filtered$seqname, diff --git a/src/process_data/multiomics/multiome_matrix/config.vsh.yaml b/src/process_data/multiomics/multiome_matrix/config.vsh.yaml index 3aedd936e..bfca7c0df 100644 --- a/src/process_data/multiomics/multiome_matrix/config.vsh.yaml +++ b/src/process_data/multiomics/multiome_matrix/config.vsh.yaml @@ -8,51 +8,50 @@ functionality: arguments: - name: --multiomics_rna type: file - required: false + required: true direction: input - default: resources/grn-benchmark/multiomics_rna.h5ad + example: resources/grn-benchmark/multiomics_rna.h5ad - name: --multiomics_atac type: file - required: false + required: true direction: input - default: resources/grn-benchmark/multiomics_atac.h5ad + example: resources/grn-benchmark/multiomics_atac.h5ad - name: --rna_matrix type: file required: false direction: output - default: output/scRNA/X_matrix.mtx - + example: output/scRNA/X_matrix.mtx - name: --atac_matrix type: file required: false direction: output - default: output/scATAC/X_matrix.mtx + example: output/scATAC/X_matrix.mtx - name: --rna_gene_annot type: file required: false direction: output - default: output/scRNA/annotation_gene.csv + example: output/scRNA/annotation_gene.csv - name: --rna_cell_annot type: file required: false direction: output - default: output/scRNA/annotation_cell.csv + example: output/scRNA/annotation_cell.csv - name: --atac_peak_annot type: file required: false direction: output - default: output/scATAC/annotation_gene.csv + example: output/scATAC/annotation_gene.csv - name: --atac_cell_annot type: file required: false direction: output - default: output/scATAC/annotation_cell.csv + example: output/scATAC/annotation_cell.csv resources: - type: python_script path: script.py diff --git a/src/process_data/multiomics/subset_hvg/config.vsh.yaml b/src/process_data/multiomics/subset_hvg/config.vsh.yaml new file mode 100644 index 000000000..0e6560bb6 --- /dev/null +++ b/src/process_data/multiomics/subset_hvg/config.vsh.yaml @@ -0,0 +1,45 @@ + +functionality: + name: subset_hvg + namespace: "multiomics" + info: + label: subset_hvg + summary: "Receives multiomics data and subsets it for hvg" + arguments: + - name: --multiomics_rna + type: file + required: true + direction: input + example: resources/grn-benchmark/multiomics_rna.h5ad + - name: --multiomics_atac + type: file + required: true + direction: input + example: resources/grn-benchmark/multiomics_atac.h5ad + + - name: --multiomics_rna_d0_hvg + type: file + required: false + direction: output + example: resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad + - name: --multiomics_atac_d0 + type: file + required: false + direction: output + example: resources/grn-benchmark/multiomics_atac_d0.h5ad + + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: [ scikit-misc ] + + + - type: native + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/process_data/multiomics/subset_hvg/script.py b/src/process_data/multiomics/subset_hvg/script.py new file mode 100644 index 000000000..72e287313 --- /dev/null +++ b/src/process_data/multiomics/subset_hvg/script.py @@ -0,0 +1,29 @@ +import anndata as ad +import scanpy as sc +## VIASH START +par = { + # 'multiome_counts': 'resources/datasets_raw/multiome_counts.h5ad', + # 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad', + # 'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad' +} +## VIASH END +multiomics_rna = ad.read_h5ad(par['multiomics_rna']) +multiomics_atac = ad.read_h5ad(par['multiomics_atac']) + +multiomics_rna_d0 = multiomics_rna[multiomics_rna.obs.donor_id=='donor_0', :] +multiomics_atac_d0 = multiomics_atac[multiomics_atac.obs.donor_id=='donor_0', :] + + +# hvgs +var = sc.pp.highly_variable_genes(multiomics_rna_d0, flavor='seurat_v3', n_top_genes=7000, inplace=False) +multiomics_rna_d0.var['highly_variable'] = var.highly_variable + +multiomics_rna_d0_hvg = multiomics_rna_d0[:, multiomics_rna_d0.var.highly_variable] + +print(multiomics_rna_d0_hvg.shape) +print(multiomics_atac_d0.shape) + + + +multiomics_rna_d0_hvg.write(par['multiomics_rna_d0_hvg']) +multiomics_atac_d0.write(par['multiomics_atac_d0']) \ No newline at end of file diff --git a/src/workflows/process_multiomics/config.vsh.yaml b/src/workflows/process_multiomics/config.vsh.yaml index d93858fb8..3e30ecac6 100644 --- a/src/workflows/process_multiomics/config.vsh.yaml +++ b/src/workflows/process_multiomics/config.vsh.yaml @@ -13,30 +13,52 @@ functionality: type: file required: true direction: input - default: resources/datasets_raw/multiome_counts.h5ad + example: resources/datasets_raw/multiome_counts.h5ad description: multiomics data at baseline - name: --multiomics_rna type: file - required: true + required: false direction: output - default: resources/grn-benchmark/multiomics_rna.h5ad + example: resources/grn-benchmark/multiomics_rna.h5ad - name: --multiomics_atac type: file - required: true + required: false + direction: output + example: resources/grn-benchmark/multiomics_atac.h5ad + + - name: --multiomics_rna_d0_hvg + type: file + required: false + direction: output + example: resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad + - name: --multiomics_atac_d0 + type: file + required: false direction: output - default: resources/grn-benchmark/multiomics_atac.h5ad + example: resources/grn-benchmark/multiomics_atac_d0.h5ad - name: --rna_rds type: file - required: true + required: false direction: output - default: resources/grn-benchmark/multiomics_rna.rds + example: resources/grn-benchmark/multiomics_rna.rds - name: --atac_rds type: file - required: true + required: false + direction: output + example: resources/grn-benchmark/multiomics_atac.rds + + - name: --rna_rds_d0_hvg + type: file + required: false + direction: output + example: resources/grn-benchmark/multiomics_rna_d0_hvg.rds + - name: --atac_rds_d0 + type: file + required: false direction: output - default: resources/grn-benchmark/multiomics_atac.rds + example: resources/grn-benchmark/multiomics_atac_d0.rds resources: - type: nextflow_script @@ -46,6 +68,7 @@ functionality: - name: multiomics/format_data - name: multiomics/multiome_matrix - name: multiomics/format_resources_r + - name: multiomics/subset_hvg platforms: - type: nextflow directives: diff --git a/src/workflows/process_multiomics/main.nf b/src/workflows/process_multiomics/main.nf index 694c2bca1..0e2e281d1 100644 --- a/src/workflows/process_multiomics/main.nf +++ b/src/workflows/process_multiomics/main.nf @@ -31,7 +31,33 @@ workflow run_wf { atac_rds: "atac_rds"] ) - | setState(["multiomics_rna", "multiomics_atac", "rna_rds", "atac_rds"]) + | subset_hvg.run( + fromState: [multiomics_rna: "multiomics_rna", multiomics_atac: "multiomics_atac"], + toState: [multiomics_rna_d0_hvg: "multiomics_rna_d0_hvg", multiomics_atac_d0: "multiomics_atac_d0"] + ) + + | multiome_matrix.run( + fromState: [multiomics_rna: "multiomics_rna_d0_hvg", multiomics_atac: "multiomics_atac_d0"], + toState: [rna_matrix_d0: "rna_matrix", + atac_matrix_d0: "atac_matrix", + rna_gene_annot_d0: "rna_gene_annot", + rna_cell_annot_d0: "rna_cell_annot", + atac_peak_annot_d0: "atac_peak_annot", + atac_cell_annot_d0: "atac_cell_annot"] + ) + + | format_resources_r.run( + fromState: [rna_matrix: "rna_matrix_d0", + atac_matrix: "atac_matrix_d0", + rna_gene_annot: "rna_gene_annot_d0", + rna_cell_annot: "rna_cell_annot_d0", + atac_peak_annot: "atac_peak_annot_d0", + atac_cell_annot: "atac_cell_annot_d0"], + toState: [rna_rds_d0_hvg: "rna_rds", + atac_rds_d0: "atac_rds"] + ) + + | setState(["multiomics_rna", "multiomics_atac", "rna_rds", "atac_rds", "multiomics_rna_d0_hvg", "multiomics_atac_d0", "rna_rds_d0_hvg", "atac_rds_d0"]) emit: output_ch