Skip to content

Commit

Permalink
figr: test 4
Browse files Browse the repository at this point in the history
  • Loading branch information
matin authored and matin committed Jul 25, 2024
1 parent 96259e5 commit a11432f
Show file tree
Hide file tree
Showing 7 changed files with 350 additions and 182 deletions.
182 changes: 0 additions & 182 deletions notebooks/preprocess.ipynb

This file was deleted.

27 changes: 27 additions & 0 deletions src/process_data/batch_corrrection_scgen/config.novsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Viash component config for the scgen batch-correction step.
functionality:
  name: batch_correction_scgen
  info:
    label: batch_correction_scgen
    summary: "Correct batch effects using scgen"

  arguments:
    # `direction: input` is the Viash default; it is spelled out here so the
    # argument reads the same way as in the sibling process_data configs.
    - name: --perturbation_data
      type: file
      required: true
      direction: input

  resources:
    - type: python_script
      path: script.py

# Execution back-ends the component can be built for.
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.4
  - type: native
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu]
15 changes: 15 additions & 0 deletions src/process_data/batch_corrrection_scgen/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# !pip install sctk anndata
# !aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/sc_counts.h5ad ./resources_raw/ --no-sign-request

import anndata as ad
import pandas as pd
import numpy as np
import sctk
from scipy import sparse
import scanpy as sc

## VIASH START
# Default parameters used when the script is run directly. When the component
# is built with Viash, everything between the VIASH START/END markers is
# replaced by the values of the arguments declared in the component config.
par = {
    'sc_counts': 'resources/raw-data/sc_counts.h5ad',
    'perturbation_data': 'resources/raw-data/perturbation_data.h5ad',
}
## VIASH END
33 changes: 33 additions & 0 deletions src/process_data/multiome/config.novsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Viash component config for splitting the multiome counts into RNA and ATAC.
functionality:
  name: multiome
  info:
    label: multiome
    summary: "Download multiome, splits to rna and atac and saves separately."

  arguments:
    # Combined multiome counts file read by the script.
    - name: --multiome_counts
      type: file
      required: true
      direction: input
    # RNA part written by the script.
    - name: --multiomics_rna
      type: file
      required: true
      direction: output
    # ATAC part written by the script.
    - name: --multiomics_atac
      type: file
      required: true
      direction: output

  resources:
    - type: python_script
      path: script.py

# Execution back-ends the component can be built for.
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.4
  - type: native
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu]
28 changes: 28 additions & 0 deletions src/process_data/multiome/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import anndata as ad
#!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-08-31_sc_multiome_expression_atac.h5ad ./resources/raw-data/ --no-sign-request
# mv resources/raw-data/2023-08-31_sc_multiome_expression_atac.h5ad resources/raw-data/multiome.h5ad
## VIASH START
# Default parameters used when the script is run directly. When the component
# is built with Viash, everything between the VIASH START/END markers is
# replaced by the values of the arguments declared in the component config.
par = {
    'multiome_counts': 'resources/raw-data/multiome.h5ad',
    'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',
    'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad',
}
## VIASH END
from pathlib import Path

# Load the combined multiome object and make the 'counts' layer the main matrix.
multiomics = ad.read_h5ad(par['multiome_counts'])
multiomics.X = multiomics.layers['counts']
del multiomics.layers  # layers are not needed once X holds the counts
multiomics.var.index.name = 'location'
multiomics.obs.index.name = 'obs_id'

# Map the cell types: the three T cell subtypes are merged into 'T cells',
# every other label is kept as it is.
cell_types_o = multiomics.obs.cell_type.unique()
T_cell_types = ['T regulatory cells', 'T cells CD8+', 'T cells CD4+']
cell_type_map = {
    cell_type: 'T cells' if cell_type in T_cell_types else cell_type
    for cell_type in cell_types_o
}
multiomics.obs['cell_type'] = multiomics.obs['cell_type'].map(cell_type_map)

# Create the output folders up front so the writes below do not fail on a
# missing directory.
for out_key in ('multiomics_rna', 'multiomics_atac'):
    Path(par[out_key]).parent.mkdir(parents=True, exist_ok=True)

# RNA: keep the 'Gene Expression' features. Slicing an AnnData yields a view;
# copy it explicitly because `.var` is reassigned right after, which would
# otherwise force an implicit copy (and a warning).
is_rna = multiomics.var.feature_types == 'Gene Expression'
multiomics_rna = multiomics[:, is_rna].copy()
multiomics_rna.var = multiomics_rna.var[['gene_ids', 'interval']]
multiomics_rna.write(par['multiomics_rna'])

# ATAC: keep the 'Peaks' features and drop every var column (index only).
is_atac = multiomics.var.feature_types == 'Peaks'
multiomics_atac = multiomics[:, is_atac].copy()
multiomics_atac.var = multiomics_atac.var[[]]
multiomics_atac.write(par['multiomics_atac'])
36 changes: 36 additions & 0 deletions src/process_data/sc_counts/config.novsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Viash component config for turning sc counts into pseudobulked perturbation data.
functionality:
  name: sc_counts
  info:
    label: sc_counts
    summary: "Processes sc counts of perturbation data to generate pseudobulked data"
    # Literal block scalar: no surrounding quote characters, otherwise they
    # become part of the description text.
    description: |
      It conducts QC on sc level to remove low quality cells and genes.
      Then, sc counts are pseudobulked and filtered for outlier compounds, samples with low quality cells, and genes with low coverage.
      Finally, it normalizes the counts data.

  arguments:
    # Single-cell counts file read by the component.
    - name: --sc_counts
      type: file
      required: true
      direction: input
    # Processed perturbation data written by the component.
    - name: --perturbation_data
      type: file
      required: true
      direction: output

  resources:
    - type: python_script
      path: script.py

# Execution back-ends the component can be built for.
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.4
  - type: native
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu]
Loading

0 comments on commit a11432f

Please sign in to comment.