figr: test 4

openproblems-bio · Jul 25, 2024 · a11432f · a11432f
1 parent 96259e5
commit a11432f
Show file tree

Hide file tree

Showing 7 changed files with 350 additions and 182 deletions.
diff --git a/notebooks/preprocess.ipynb b/notebooks/preprocess.ipynb
diff --git a/src/process_data/batch_corrrection_scgen/config.novsh.yaml b/src/process_data/batch_corrrection_scgen/config.novsh.yaml
@@ -0,0 +1,27 @@
+functionality:
+  name: batch_correction_scgen
+  info:
+    label: batch_correction_scgen
+    summary: "Correct batch effects using scgen"
+
+
+  arguments:
+    - name: --perturbation_data
+      type: file 
+      required: true
+
+
+
+  resources:
+    - type: python_script
+      path: script.py
+
+
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+
+  - type: native
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/process_data/batch_corrrection_scgen/script.py b/src/process_data/batch_corrrection_scgen/script.py
@@ -0,0 +1,15 @@
+# !pip install sctk anndata
+# !aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/sc_counts.h5ad  ./resources_raw/ --no-sign-request
+
+import anndata as ad 
+import pandas as pd
+import numpy as np
+import sctk
+from scipy import sparse
+import scanpy as sc
+
+par = {
+    # NOTE(review): this component's config.novsh.yaml declares only --perturbation_data; 'sc_counts' has no matching argument there.
+    'sc_counts': 'resources/raw-data/sc_counts.h5ad',
+    'perturbation_data': 'resources/raw-data/perturbation_data.h5ad',
+}
diff --git a/src/process_data/multiome/config.novsh.yaml b/src/process_data/multiome/config.novsh.yaml
@@ -0,0 +1,33 @@
+functionality:
+  name: multiome
+  info:
+    label: multiome
+    summary: "Downloads multiome data, splits it into RNA and ATAC, and saves them separately."
+
+  arguments:
+    - name: --multiome_counts
+      type: file 
+      required: True
+      direction: input
+    - name: --multiomics_rna
+      type: file 
+      required: true
+      direction: output
+    - name: --multiomics_atac
+      type: file 
+      required: true
+      direction: output
+
+  resources:
+    - type: python_script
+      path: script.py
+
+
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+
+  - type: native
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/process_data/multiome/script.py b/src/process_data/multiome/script.py
@@ -0,0 +1,28 @@
+import anndata as ad
+#!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-08-31_sc_multiome_expression_atac.h5ad ./resources/raw-data/ --no-sign-request
+# mv resources/raw-data/2023-08-31_sc_multiome_expression_atac.h5ad resources/raw-data/multiome.h5ad
+par = {
+    'multiome_counts': 'resources/raw-data/multiome.h5ad',
+    'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',
+    'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad'
+}
+# Load 
+multiomics = ad.read_h5ad(par['multiome_counts'])
+multiomics.X = multiomics.layers['counts']
+del multiomics.layers
+multiomics.var.index.name='location'
+multiomics.obs.index.name='obs_id'
+
+# map the cell types
+cell_types_o = multiomics.obs.cell_type.unique()
+T_cell_types = ['T regulatory cells', 'T cells CD8+', 'T cells CD4+']
+cell_type_map = {cell_type: 'T cells' if cell_type in T_cell_types else cell_type for cell_type in cell_types_o}
+multiomics.obs['cell_type'] = multiomics.obs['cell_type'].map(cell_type_map)
+# RNA
+multiomics_rna = multiomics[:,multiomics.var.feature_types=='Gene Expression']
+multiomics_rna.var = multiomics_rna.var[['gene_ids', 'interval']]
+multiomics_rna.write(par['multiomics_rna'])
+# ATAC
+multiomics_atac = multiomics[:,multiomics.var.feature_types=='Peaks']
+multiomics_atac.var = multiomics_atac.var[[]]
+multiomics_atac.write(par['multiomics_atac'])
diff --git a/src/process_data/sc_counts/config.novsh.yaml b/src/process_data/sc_counts/config.novsh.yaml
@@ -0,0 +1,36 @@
+functionality:
+  name: sc_counts
+  info:
+    label: sc_counts
+    summary: "Processes sc counts of perturbation data to generate pseudobulked data"
+    description:
+      "
+      It conducts QC at the single-cell level to remove low-quality cells and genes.
+      Then, sc counts are pseudobulked and filtered for outlier compounds, samples with low-quality cells, and genes with low coverage.
+      Finally, it normalizes the count data.
+      "
+
+  arguments:
+    - name: --sc_counts
+      type: file 
+      required: True
+      direction: input
+    - name: --perturbation_data
+      type: file 
+      required: true
+      direction: output
+
+
+  resources:
+    - type: python_script
+      path: script.py
+
+
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+
+  - type: native
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]