Skip to content

Commit

Permalink
fix configs
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Sep 21, 2024
1 parent 99284d3 commit 2b55ea9
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 166 deletions.
1 change: 1 addition & 0 deletions src/api/comp_process_dataset.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
namespace: data_processors
info:
type: process_dataset
type_info:
Expand Down
29 changes: 7 additions & 22 deletions src/data_processors/process_dataset/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,34 +1,19 @@
__merge__: ../../api/comp_data_processor.yaml
__merge__: /src/api/comp_process_dataset.yaml
name: process_dataset
arguments:
- name: "--method"
type: "string"
description: "The process method to assign train/test."
choices: ["batch", "random"]
default: "batch"
- name: "--obs_label"
type: "string"
description: "Which .obs slot to use as label."
default: "cell_type"
- name: "--obs_batch"
type: "string"
description: "Which .obs slot to use as batch covariate."
default: "batch"
- name: "--seed"
type: "integer"
description: "A seed for the subsampling."
example: 123
description: Preprocess adata object for data integration
resources:
- type: python_script
path: script.py
- path: /common/helper_functions/subset_h5ad_by_format.py

engines:
- type: docker
image: openproblems/base_python:1.0.0

setup:
- type: python
pypi:
- scib==1.1.5
runners:
- type: executable
- type: nextflow
directives:
label: [highmem, midcpu, midtime]
label: [highmem, midcpu, midtime]
87 changes: 39 additions & 48 deletions src/data_processors/process_dataset/script.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,77 @@
import sys
import random
import numpy as np
import anndata as ad
import openproblems as op

## VIASH START
par = {
'input': 'resources_test/common/pancreas/dataset.h5ad',
'method': 'batch',
'seed': None,
'obs_batch': 'batch',
'hvgs': 2000,
'obs_label': 'cell_type',
'output_train': 'train.h5ad',
'output_test': 'test.h5ad',
'output_solution': 'solution.h5ad'
'obs_batch': 'batch',
'subset_hvg': False,
'output': 'output.h5ad'
}
meta = {
'resources_dir': 'target/executable/data_processors/process_dataset',
'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml'
"config": "target/nextflow/batch_integration/process_dataset/.config.vsh.yaml",
"resources_dir": "src/common/helper_functions"
}
## VIASH END

# import helper functions
sys.path.append(meta['resources_dir'])
from subset_h5ad_by_format import subset_h5ad_by_format

print(">> Load config", flush=True)
config = op.project.read_viash_config(meta["config"])

# set seed if need be
if par["seed"]:
print(f">> Setting seed to {par['seed']}")
random.seed(par["seed"])
print('Read input', flush=True)
input = ad.read_h5ad(par['input'])

def compute_batched_hvg(adata, n_hvgs):
    """Flag highly variable genes (HVGs) in ``adata.var['hvg']``.

    Selects ``n_hvgs`` batch-aware HVGs with ``scib.pp.hvg_batch`` using the
    ``'normalized'`` layer and the ``'batch'`` obs column as batch key. If
    ``n_hvgs`` is not a usable subset size (non-positive, or larger than the
    number of genes), every gene is flagged as HVG instead.

    NOTE: mutates *adata* in place (adds the ``hvg`` var column) and also
    returns it.
    """
    # Degenerate request: keep the full gene set rather than subsetting.
    keep_all = n_hvgs <= 0 or n_hvgs > adata.n_vars
    if keep_all:
        selected = adata.var_names.tolist()
    else:
        # scib is only needed on this branch, so import it lazily.
        import scib
        tmp = adata.copy()
        # hvg_batch operates on .X, so expose the normalized layer there.
        tmp.X = tmp.layers['normalized'].copy()
        selected = scib.pp.hvg_batch(
            tmp,
            batch_key='batch',
            target_genes=n_hvgs,
            adataOut=False,
        )
    adata.var['hvg'] = adata.var_names.isin(selected)
    return adata

print(">> Load data", flush=True)
adata = ad.read_h5ad(par["input"])
print("input:", adata)
print(f'Select {par["hvgs"]} highly variable genes', flush=True)
adata_with_hvg = compute_batched_hvg(input, n_hvgs=par['hvgs'])

print(f">> Process data using {par['method']} method")
if par["method"] == "batch":
batch_info = adata.obs[par["obs_batch"]]
batch_categories = batch_info.dtype.categories
test_batches = random.sample(list(batch_categories), 1)
is_test = [ x in test_batches for x in batch_info ]
elif par["method"] == "random":
train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False)
is_test = [ not x in train_ix for x in range(0, adata.n_obs) ]
if par['subset_hvg']:
print('Subsetting to HVG dimensions', flush=True)
adata_with_hvg = adata_with_hvg[:, adata_with_hvg.var['hvg']].copy()

# subset the different adatas
print(">> Figuring which data needs to be copied to which output file", flush=True)
print(">> Figuring out which data needs to be copied to which output file", flush=True)
# use par arguments to look for label and batch value in different slots
slot_mapping = {
"obs": {
"label": par["obs_label"],
"batch": par["obs_batch"],
}
}

print(">> Creating train data", flush=True)
output_train = subset_h5ad_by_format(
adata[[not x for x in is_test]],
print(">> Create output object", flush=True)
output_dataset = subset_h5ad_by_format(
adata_with_hvg,
config,
"output_train",
"output_dataset",
slot_mapping
)

print(">> Creating test data", flush=True)
output_test = subset_h5ad_by_format(
adata[is_test],
config,
"output_test",
slot_mapping
)

print(">> Creating solution data", flush=True)
output_solution = subset_h5ad_by_format(
adata[is_test],
adata_with_hvg,
config,
"output_solution",
slot_mapping
)

print(">> Writing data", flush=True)
output_train.write_h5ad(par["output_train"])
output_test.write_h5ad(par["output_test"])
output_solution.write_h5ad(par["output_solution"])
print('Writing adatas to file', flush=True)
output_dataset.write_h5ad(par['output_dataset'], compression='gzip')
output_solution.write_h5ad(par['output_solution'], compression='gzip')
19 changes: 0 additions & 19 deletions src/process_dataset/config.vsh.yaml

This file was deleted.

77 changes: 0 additions & 77 deletions src/process_dataset/script.py

This file was deleted.

2 changes: 2 additions & 0 deletions src/workflows/process_datasets/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
name: process_datasets
namespace: workflows

status: disabled

argument_groups:
- name: Inputs
arguments:
Expand Down
2 changes: 2 additions & 0 deletions src/workflows/run_benchmark/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
name: run_benchmark
namespace: workflows

status: disabled

argument_groups:
- name: Inputs
arguments:
Expand Down

0 comments on commit 2b55ea9

Please sign in to comment.