Skip to content

Commit

Permalink
fix workflows
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Sep 22, 2024
1 parent 2d3ef4c commit 0603061
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 218 deletions.
18 changes: 14 additions & 4 deletions scripts/create_resources/test_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ DATASET_DIR=resources_test/task_batch_integration
mkdir -p $DATASET_DIR

# process dataset
viash run src/process_dataset/config.vsh.yaml -- \
viash run src/data_processors/process_dataset/config.vsh.yaml -- \
--input "$RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad" \
--output_dataset "$DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad" \
--output_solution "$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad"
Expand All @@ -25,18 +25,28 @@ viash run src/methods/combat/config.vsh.yaml -- \
--output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad

# run transformer
viash run src/transformers/transform/config.vsh.yaml -- \
viash run src/data_processors/transform/config.vsh.yaml -- \
--input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad \
--input_dataset $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \
--expected_method_types feature \
--output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad

# run one metric
viash run src/metrics/accuracy/config.vsh.yaml -- \
--input_prediction $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad \
viash run src/metrics/graph_connectivity/config.vsh.yaml -- \
--input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad \
--input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \
--output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad

# write the state file
cat > $DATASET_DIR/state.yaml << HERE
id: cxg_mouse_pancreas_atlas
output_dataset: !file dataset.h5ad
output_solution: !file solution.h5ad
output_integrated: !file integrated.h5ad
output_integrated_full: !file integrated_full.h5ad
output_score: !file score.h5ad
HERE

# only run this if you have access to the openproblems-data bucket
aws s3 sync --profile op \
"resources_test/task_batch_integration" \
Expand Down
10 changes: 1 addition & 9 deletions scripts/run_benchmark/run_test_local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,6 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

# remove this when you have implemented the script
echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it."
echo " Step 1: replace 'task_batch_integration' with the name of the task in the following command."
echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs"
echo " Step 3: replace the settings parameter to fit your run_benchmark outputs"
echo " Step 4: remove this message"
exit 1

set -e

echo "Running benchmark on test data"
Expand All @@ -26,7 +18,7 @@ publish_dir="resources/results/${RUN_ID}"
# write the parameters to file
cat > /tmp/params.yaml << HERE
input_states: s3://openproblems-data/resources_test/task_batch_integration/**/state.yaml
rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution'
rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE
Expand Down
2 changes: 1 addition & 1 deletion src/api/comp_transformer.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
namespace: transformers
namespace: data_processors
info:
type: transformer
type_info:
Expand Down
File renamed without changes.
File renamed without changes.
38 changes: 16 additions & 22 deletions src/workflows/process_datasets/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,29 +1,23 @@
name: process_datasets
namespace: workflows

status: disabled

argument_groups:
# - name: Inputs
# arguments:
# - name: "--input"
# __merge__: /src/api/file_common_dataset.yaml
# required: true
# direction: input
# - name: Outputs
# arguments:
# - name: "--output_train"
# __merge__: /src/api/file_train_h5ad.yaml
# required: true
# direction: output
# - name: "--output_test"
# __merge__: /src/api/file_test_h5ad.yaml
# required: true
# direction: output
# - name: "--output_solution"
# __merge__: /src/api/file_solution.yaml
# required: true
# direction: output
- name: Inputs
arguments:
- name: "--input"
__merge__: /src/api/file_common_dataset.yaml
required: true
direction: input
- name: Outputs
arguments:
- name: "--output_dataset"
__merge__: /src/api/file_dataset.yaml
required: true
direction: output
- name: "--output_solution"
__merge__: /src/api/file_solution.yaml
required: true
direction: output

resources:
- type: nextflow_script
Expand Down
125 changes: 3 additions & 122 deletions src/workflows/process_datasets/main.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
include { findArgumentSchema } from "${meta.resources_dir}/helper.nf"

workflow auto {
findStatesTemp(params, meta.config)
findStates(params, meta.config)
| meta.workflow.run(
auto: [publish: "state"]
)
Expand Down Expand Up @@ -41,133 +41,14 @@ workflow run_wf {
| process_dataset.run(
fromState: [ input: "dataset" ],
toState: [
output_train: "output_train",
output_test: "output_test",
output_dataset: "output_dataset",
output_solution: "output_solution"
]
)

// only output the files for which an output file was specified
| setState(["output_train", "output_test", "output_solution"])
| setState(["output_dataset", "output_solution"])

emit:
output_ch
}


// temp fix for rename_keys typo

def findStatesTemp(Map params, Map config) {
def auto_config = deepClone(config)
def auto_params = deepClone(params)

auto_config = auto_config.clone()
// override arguments
auto_config.argument_groups = []
auto_config.arguments = [
[
type: "string",
name: "--id",
description: "A dummy identifier",
required: false
],
[
type: "file",
name: "--input_states",
example: "/path/to/input/directory/**/state.yaml",
description: "Path to input directory containing the datasets to be integrated.",
required: true,
multiple: true,
multiple_sep: ";"
],
[
type: "string",
name: "--filter",
example: "foo/.*/state.yaml",
description: "Regex to filter state files by path.",
required: false
],
// to do: make this a yaml blob?
[
type: "string",
name: "--rename_keys",
example: ["newKey1:oldKey1", "newKey2:oldKey2"],
description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.",
required: false,
multiple: true,
multiple_sep: ";"
],
[
type: "string",
name: "--settings",
example: '{"output_dataset": "dataset.h5ad", "k": 10}',
description: "Global arguments as a JSON glob to be passed to all components.",
required: false
]
]
if (!(auto_params.containsKey("id"))) {
auto_params["id"] = "auto"
}

// run auto config through processConfig once more
auto_config = processConfig(auto_config)

workflow findStatesTempWf {
helpMessage(auto_config)

output_ch =
channelFromParams(auto_params, auto_config)
| flatMap { autoId, args ->

def globalSettings = args.settings ? readYamlBlob(args.settings) : [:]

// look for state files in input dir
def stateFiles = args.input_states

// filter state files by regex
if (args.filter) {
stateFiles = stateFiles.findAll{ stateFile ->
def stateFileStr = stateFile.toString()
def matcher = stateFileStr =~ args.filter
matcher.matches()}
}

// read in states
def states = stateFiles.collect { stateFile ->
def state_ = readTaggedYaml(stateFile)
[state_.id, state_]
}

// construct renameMap
if (args.rename_keys) {
def renameMap = args.rename_keys.collectEntries{renameString ->
def split = renameString.split(":")
assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'"
split
}

// rename keys in state, only let states through which have all keys
// also add global settings
states = states.collectMany{id, state ->
def newState = [:]

for (key in renameMap.keySet()) {
def origKey = renameMap[key]
if (!(state.containsKey(origKey))) {
return []
}
newState[key] = state[origKey]
}

[[id, globalSettings + newState]]
}
}

states
}
emit:
output_ch
}

return findStatesTempWf
}
60 changes: 40 additions & 20 deletions src/workflows/run_benchmark/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
name: run_benchmark
namespace: workflows

status: disabled

argument_groups:
- name: Inputs
arguments:
# - name: "--input_train"
# __merge__: /src/api/file_train_h5ad.yaml
# type: file
# direction: input
# required: true
# - name: "--input_test"
# __merge__: /src/api/file_test_h5ad.yaml
# type: file
# direction: input
# required: true
# - name: "--input_solution"
# __merge__: /src/api/file_solution.yaml
# type: file
# direction: input
# required: true
- name: "--input_dataset"
__merge__: /src/api/file_dataset.yaml
type: file
direction: input
required: true
- name: "--input_solution"
__merge__: /src/api/file_solution.yaml
type: file
direction: input
required: true
- name: Outputs
arguments:
- name: "--output_scores"
Expand Down Expand Up @@ -66,9 +59,36 @@ resources:
dependencies:
- name: h5ad/extract_uns_metadata
repository: core
- name: control_methods/true_labels
- name: methods/logistic_regression
- name: metrics/accuracy
- name: control_methods/embed_cell_types
- name: control_methods/embed_cell_types_jittered
- name: control_methods/no_integration
- name: control_methods/no_integration_batch
- name: control_methods/shuffle_integration
- name: control_methods/shuffle_integration_by_batch
- name: control_methods/shuffle_integration_by_cell_type
- name: methods/bbknn
- name: methods/combat
- name: methods/fastmnn
- name: methods/liger
- name: methods/mnn_correct
- name: methods/mnnpy
- name: methods/pyliger
- name: methods/scalex
- name: methods/scanorama
- name: methods/scanvi
- name: methods/scvi
- name: metrics/asw_batch
  - name: metrics/asw_label
- name: metrics/cell_cycle_conservation
- name: metrics/clustering_overlap
- name: metrics/graph_connectivity
- name: metrics/hvg_overlap
- name: metrics/isolated_label_asw
- name: metrics/isolated_label_f1
- name: metrics/kbet
- name: metrics/lisi
- name: metrics/pcr
- name: data_processors/transform

runners:
- type: nextflow
Loading

0 comments on commit 0603061

Please sign in to comment.