diff --git a/runscripts/nextflow/config.template b/runscripts/nextflow/config.template index 31417a82f..d84c3fa79 100644 --- a/runscripts/nextflow/config.template +++ b/runscripts/nextflow/config.template @@ -14,25 +14,36 @@ trace { profiles { gcloud { - // Retry once with more RAM if OOM - process.memory = { 4.GB * task.attempt } - // Using single core is slightly slower but much cheaper - process.cpus = 1 - process.executor = 'google-batch' - process.container = params.container_image - // Necessary otherwise symlinks to other files in bucket can break - process.containerOptions = '--volume /mnt/disks/BUCKET:/mnt/disks/BUCKET' process { withLabel: parca { cpus = params.parca_cpus memory = params.parca_cpus * 2.GB } + errorStrategy = { + // Codes: 137 (out-of-memory), 50001 - 50006 (Google Batch task fail: + // https://cloud.google.com/batch/docs/troubleshooting#reserved-exit-codes) + (((task.exitStatus == 137) || (task.exitStatus >= 50001 && task.exitStatus <= 50006)) + && (task.attempt <= process.maxRetries)) ? 'retry' : 'ignore' + } + // Retry once with more RAM if OOM + memory = { 4.GB * task.attempt } + maxRetries = 1 + // Using single core is slightly slower but much cheaper + cpus = 1 + executor = 'google-batch' + container = params.container_image + // Necessary otherwise symlinks to other files in bucket can break + containerOptions = '--volume /mnt/disks/BUCKET:/mnt/disks/BUCKET' + // Check Google Cloud latest spot pricing / performance + machineType = { + def cpus = task.cpus + def powerOf2 = 1 + while (powerOf2 < cpus && powerOf2 < 64) { + powerOf2 *= 2 + } + return "t2d-standard-${powerOf2}" + } } - process.errorStrategy = { - // Codes: 137 (out-of-memory), 50001 - 50006 (Google Batch task fail: - // https://cloud.google.com/batch/docs/troubleshooting#reserved-exit-codes) - (((task.exitStatus == 137) || (task.exitStatus >= 50001 && task.exitStatus <= 50006)) - && (task.attempt <= process.maxRetries)) ? 
'retry' : 'ignore' } // For this script to work on a Compute Engine VM, you must // - Set default Compute Engine region and zone for your project // - Set access scope to "Allow full access to all Cloud APIs" when @@ -54,60 +65,51 @@ profiles { google.batch.subnetwork = "regions/${google.location}/subnetworks/default" docker.enabled = true params.projectRoot = '/vEcoli' - process.maxRetries = 1 - // Check Google Cloud latest spot pricing / performance - process.machineType = { - def cpus = task.cpus - def powerOf2 = 1 - while (powerOf2 < cpus && powerOf2 < 64) { - powerOf2 *= 2 - } - return "t2d-standard-${powerOf2}" - } workflow.failOnIgnore = true } sherlock { - process.memory = { - if ( task.exitStatus in [137, 140] ) { - 4.GB * task.attempt - } else { - 4.GB - } - } - process.errorStrategy = { - // Codes: 140 (SLURM job limits), 143 (SLURM preemption) - // Default value for exitStatus is max integer value, this - // is a catch-all for errors that leave no exit code - ((task.exitStatus in [140, 143, Integer.MAX_VALUE]) - && (task.attempt <= process.maxRetries)) ? 
'retry' : 'ignore' } - // Using single core is slightly slower but can have shorter - // queue times and is less damaging to future job priority - process.cpus = 1 - process.executor = 'slurm' - process.queue = 'mcovert,owners' - process.container = params.container_image - apptainer.enabled = true process { // Run analyses, create variants, and run ParCa locally with // the job used to launch workflow to avoid long queue times withLabel: short { executor = 'local' - }, + } // ParCa 4 CPUs in ~15 min, 1 CPU in ~30 min, not too bad withLabel: parca { executor = 'local' cpus = 1 memory = 2.GB } - } - process.time = { - if ( task.exitStatus == 140 ) { - 1.h * task.attempt - } else { - 1.h + container = params.container_image + queue = 'mcovert,owners' + executor = 'slurm' + // Single core sims are slightly slower but can have shorter + // queue times and are less damaging to future job priority + cpus = 1 + memory = { + if ( task.exitStatus in [137, 140] ) { + 4.GB * task.attempt + } else { + 4.GB + } } + time = { + if ( task.exitStatus == 140 ) { + 1.h * task.attempt + } else { + 1.h + } + } + errorStrategy = { + // Codes: 140 (SLURM job limits), 143 (SLURM preemption) + // Default value for exitStatus is max integer value, this + // is a catch-all for errors that leave no exit code + ((task.exitStatus in [140, 143, Integer.MAX_VALUE]) + && (task.attempt <= process.maxRetries)) ? 
'retry' : 'ignore' + } + maxRetries = 3 } - process.maxRetries = 3 + apptainer.enabled = true params.projectRoot = "${launchDir}" // Avoid getting queue status too frequently (can cause job status mixups) executor.queueStatInterval = '2 min' @@ -127,15 +129,15 @@ profiles { workflow.failOnIgnore = true } standard { - process.executor = 'local' params.projectRoot = "${launchDir}" workflow.failOnIgnore = true - process.errorStrategy = 'ignore' process { withLabel: parca { cpus = params.parca_cpus memory = params.parca_cpus * 2.GB } + executor = 'local' + errorStrategy = 'ignore' } } } diff --git a/runscripts/workflow.py b/runscripts/workflow.py index 6ea04c7fe..f95217dc7 100644 --- a/runscripts/workflow.py +++ b/runscripts/workflow.py @@ -379,9 +379,7 @@ def main(): nf_config = nf_config.replace( "PUBLISH_DIR", os.path.dirname(os.path.dirname(out_uri)) ) - nf_config = nf_config.replace( - "PARCA_CPUS", str(config["parca_options"]["cpus"]) - ) + nf_config = nf_config.replace("PARCA_CPUS", str(config["parca_options"]["cpus"])) # By default, assume running on local device nf_profile = "standard"