scenicplus pipeline fixed

openproblems-bio · Oct 8, 2024 · 1180b20
1 parent 652b00e
commit 1180b20
Show file tree

Hide file tree

Showing 7 changed files with 50 additions and 48 deletions.
diff --git a/runs.ipynb b/runs.ipynb
@@ -819,38 +819,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Submitted batch job 7757148\n"
+      "Submitted batch job 7757181\n"
      ]
     }
    ],
    "source": [
     "if True:\n",
+    "    par = {\n",
+    "        'methods': ['scglue'],\n",
+    "        'models_dir': 'resources/grn_models/',\n",
+    "        'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad', \n",
+    "        'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad', \n",
+    "        'num_workers': 20,\n",
+    "        'mem': \"120GB\",\n",
+    "        'time': \"48:00:00\"\n",
+    "    }\n",
+    "    \n",
     "    # par = {\n",
-    "    #     'methods': ['scglue'],\n",
-    "    #     'models_dir': 'resources/grn_models/',\n",
-    "    #     'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad', \n",
-    "    #     'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad', \n",
+    "    #     'methods': ['scenicplus'],\n",
+    "    #     'models_dir': 'resources/grn_models/d0_hvg',\n",
+    "    #     'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad', \n",
+    "    #     'multiomics_atac': 'resources/grn-benchmark/multiomics_atac_d0.h5ad', \n",
     "    #     'num_workers': 20,\n",
     "    #     'mem': \"250GB\",\n",
     "    #     'time': \"48:00:00\"\n",
     "    # }\n",
-    "    \n",
-    "    par = {\n",
-    "        'methods': ['scenicplus'],\n",
-    "        'models_dir': 'resources/grn_models/d0_hvg',\n",
-    "        'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad', \n",
-    "        'multiomics_atac': 'resources/grn-benchmark/multiomics_atac_d0.h5ad', \n",
-    "        'num_workers': 20,\n",
-    "        'mem': \"250GB\",\n",
-    "        'time': \"48:00:00\"\n",
-    "    }\n",
     "\n",
     "    for method in par['methods']:\n",
     "        par['prediction'] = f\"{par['models_dir']}/{method}.csv\"\n",

diff --git a/src/methods/multi_omics/scenicplus/main.py b/src/methods/multi_omics/scenicplus/main.py
@@ -721,8 +721,6 @@ def snakemake_pipeline(par):
     cwd = os.getcwd()
     print(cwd)
 
-
-
     settings['input_data']['cisTopic_obj_fname'] = f"{cwd}/{par['cistopic_object']}"
     settings['input_data']['GEX_anndata_fname'] = f"{cwd}/{os.path.join(par['temp_dir'], 'rna.h5ad')}"
     settings['input_data']['region_set_folder'] = f"{cwd}/{os.path.join(par['temp_dir'], 'region_sets')}"

diff --git a/src/methods/multi_omics/scenicplus/script.py b/src/methods/multi_omics/scenicplus/script.py
@@ -68,21 +68,21 @@ def main(par):
     par['MALLET_PATH'] = os.path.join(par['temp_dir'], 'Mallet-202108', 'bin', 'mallet')
     os.makedirs(par['atac_dir'], exist_ok=True)
 
-    # print('------- download_databases -------')
-    # download_databases(par)
-    # print_memory_usage()
-    # print('------- process_peak -------')
-    # process_peak(par)
-    # print_memory_usage()
-    # print('------- run_cistopic -------')
-    # run_cistopic(par)
-    # print_memory_usage()
-    # print('------- process_topics -------')
-    # process_topics(par)
-    # print_memory_usage()
-    # print('------- preprocess_rna -------')
-    # preprocess_rna(par)
-    # print_memory_usage()
+    print('------- download_databases -------')
+    download_databases(par)
+    print_memory_usage()
+    print('------- process_peak -------')
+    process_peak(par)
+    print_memory_usage()
+    print('------- run_cistopic -------')
+    run_cistopic(par)
+    print_memory_usage()
+    print('------- process_topics -------')
+    process_topics(par)
+    print_memory_usage()
+    print('------- preprocess_rna -------')
+    preprocess_rna(par)
+    print_memory_usage()
     print('------- snakemake_pipeline -------')
     snakemake_pipeline(par)
     print_memory_usage()

diff --git a/src/methods/multi_omics/scenicplus_ns/config.novsh.yaml b/src/methods/multi_omics/scenicplus_ns/config.vsh.yaml
@@ -20,10 +20,14 @@ functionality:
       description: "Whether to perform quality control."
     - name: --cell_topic
       type: file 
-      default: output/cell_topic.csv
       required: false
       direction: output
       description: "Cell-topics prob scores"
+    - name: --grn_extended
+      type: file 
+      required: false
+      direction: output
+      description: "Source-target-peak triplets"
 
 
   resources:
@@ -36,4 +40,4 @@ functionality:
 platforms:
   - type: nextflow
     directives:
-      label: [ midtime, highmem, highcpu ]
+      label: [ onedaytime, highmem, highcpu ]
diff --git a/src/methods/multi_omics/scenicplus_ns/main.nf b/src/methods/multi_omics/scenicplus_ns/main.nf
@@ -13,10 +13,10 @@ workflow run_wf {
               num_workers: "num_workers"
 
               ],
-      toState: [prediction:"prediction", cell_topic:"cell_topic", scplus_mdata:"scplus_mdata"]
+      toState: [prediction:"prediction", cell_topic:"cell_topic", scplus_mdata:"scplus_mdata", grn_extended:"grn_extended"]
     )
 
-    | setState(["prediction", "cell_topic", "scplus_mdata"])
+    | setState(["prediction", "cell_topic", "scplus_mdata", "grn_extended"])
 
   emit:
   output_ch

diff --git a/src/methods/multi_omics/scglue/main.py b/src/methods/multi_omics/scglue/main.py
@@ -262,10 +262,10 @@ def prune_grn(par):
         "--annotations_fname", f"{par['temp_dir']}/ctx_annotation.tsv",
         "--expression_mtx_fname", f"{par['temp_dir']}/rna.loom",
         "--output", f"{par['temp_dir']}/pruned_grn.csv",
-        # "--top_n_targets", str(par['top_n_targets']),
-        # "--rank_threshold", str(par['rank_threshold']),
-        # "--auc_threshold", "0.1",
-        # "--nes_threshold", str(par['nes_threshold']), 
+        "--top_n_targets", str(par['top_n_targets']),
+        "--rank_threshold", str(par['rank_threshold']),
+        "--auc_threshold", "0.1",
+        "--nes_threshold", str(par['nes_threshold']), 
         "--min_genes", "1",
         "--num_workers", f"{par['num_workers']}",
         "--cell_id_attribute", "obs_id", # be sure that obs_id is in obs and name is in var
@@ -300,11 +300,11 @@ def main(par):
     # preprocess(par)
     # print('----- training ---- ', flush=True)
     # training(par)
-    print('----- create_prior ---- ', flush=True)
-    create_prior(par)
-    print('----- pyscenic_grn ---- ', flush=True)
-    pyscenic_grn(par)
-    print('----- prune_grn ---- ', flush=True)
+    # print('----- create_prior ---- ', flush=True)
+    # create_prior(par)
+    # print('----- pyscenic_grn ---- ', flush=True)
+    # pyscenic_grn(par)
+    # print('----- prune_grn ---- ', flush=True)
     prune_grn(par)
     print('Curate predictions', flush=True)
     pruned_grn = pd.read_csv(

diff --git a/src/methods/multi_omics/scglue/script.py b/src/methods/multi_omics/scglue/script.py
@@ -11,8 +11,8 @@
   "num_workers": 20,
   "prediction": "output/scglue_d0_hvg.csv",
   "max_n_links": 50000,
-  "nes_threshold": 1.5,
-  "rank_threshold": 1500,
+  "nes_threshold": 1,
+  "rank_threshold": 5000,
   "top_n_targets": 100,
   'normalize': False,
   'extend_range': 150000