# src/schema/workflow_execution_activity.yaml

# Schema identity and licensing.
name: NMDC-Workflow-Execution
id: https://w3id.org/nmdc/workflow_execution_activity
title: Workflow Execution Activities module for NMDC Schema
license: https://creativecommons.org/publicdomain/zero/1.0/

# Other schema modules this module builds on.
imports: [core]

# CURIE prefix expansions; nmdc is the default prefix for this module.
prefixes:
  linkml: https://w3id.org/linkml/
  nmdc: https://w3id.org/nmdc/
default_prefix: nmdc

# Range applied to slots that do not declare one themselves.
default_range: string

classes:
  # Metagenome assembly workflow record: a WorkflowExecution subclass that
  # carries the assembly statistic slots listed below.
  MetagenomeAssembly:
    is_a: WorkflowExecution
    class_uri: nmdc:MetagenomeAssembly
    description: >-
      A workflow execution activity that converts sequencing reads
      into an assembled metagenome.
    comments:
      - >-
        instances of this class may use a de novo assembly strategy
        in most or all cases relevant to NMDC
    in_subset: [workflow subset]
    slots:
      - asm_score
      # scaffold statistics
      - scaffolds
      - scaf_logsum
      - scaf_powsum
      - scaf_max
      - scaf_bp
      - scaf_n50
      - scaf_n90
      - scaf_l50
      - scaf_l90
      - scaf_n_gt50k
      - scaf_l_gt50k
      - scaf_pct_gt50k
      # contig statistics
      - contigs
      - contig_bp
      - ctg_n50
      - ctg_l50
      - ctg_n90
      - ctg_l90
      - ctg_logsum
      - ctg_powsum
      - ctg_max
      # gap and GC content
      - gap_pct
      - gc_std
      - gc_avg
      # read counts
      - num_input_reads
      - num_aligned_reads
      - insdc_assembly_identifiers
    slot_usage:
      # The id is mandatory and must contain the literal "wfmgas" segment.
      # Brace placeholders are interpolated; presumably they are defined in
      # schema settings outside this file - TODO confirm.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfmgas-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # The referenced id must use the "omprc" or "dgns" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgns)-{id_shoulder}-{id_blade}$'

  # Metatranscriptome assembly workflow record: a WorkflowExecution subclass
  # that lists the same assembly statistic slots as MetagenomeAssembly.
  MetatranscriptomeAssembly:
    is_a: WorkflowExecution
    class_uri: nmdc:MetatranscriptomeAssembly
    in_subset: [workflow subset]
    slots:
      - asm_score
      # scaffold statistics
      - scaffolds
      - scaf_logsum
      - scaf_powsum
      - scaf_max
      - scaf_bp
      - scaf_n50
      - scaf_n90
      - scaf_l50
      - scaf_l90
      - scaf_n_gt50k
      - scaf_l_gt50k
      - scaf_pct_gt50k
      # contig statistics
      - contigs
      - contig_bp
      - ctg_n50
      - ctg_l50
      - ctg_n90
      - ctg_l90
      - ctg_logsum
      - ctg_powsum
      - ctg_max
      # gap and GC content
      - gap_pct
      - gc_std
      - gc_avg
      # read counts
      - num_input_reads
      - num_aligned_reads
      - insdc_assembly_identifiers
    slot_usage:
      # The id is mandatory and must contain the literal "wfmtas" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfmtas-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # The referenced id must use the "omprc" or "dgns" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgns)-{id_shoulder}-{id_blade}$'

  # Metatranscriptome annotation workflow record (WorkflowExecution subclass).
  MetatranscriptomeAnnotation:
    is_a: WorkflowExecution
    class_uri: nmdc:MetatranscriptomeAnnotation
    in_subset: [workflow subset]
    slots:
      - img_identifiers
      - gold_analysis_project_identifiers
    slot_usage:
      # The id is mandatory and must contain the literal "wfmtan" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfmtan-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # At most one IMG identifier per record.
      img_identifiers:
        maximum_cardinality: 1
      # Inputs and outputs are both constrained to ids with the "dobj" segment.
      has_input:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(dobj)-{id_shoulder}-{id_blade}$'
      has_output:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(dobj)-{id_shoulder}-{id_blade}$'
      # The referenced id must use the "omprc" or "dgns" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgns)-{id_shoulder}-{id_blade}$'
      # Plain regex with no brace placeholders: "gold:Ga" followed by digits.
      gold_analysis_project_identifiers:
        structured_pattern:
          interpolated: true
          syntax: '^gold:Ga[0-9]+$'

  # Expression analysis workflow record (WorkflowExecution subclass).
  MetatranscriptomeExpressionAnalysis:
    class_uri: nmdc:MetatranscriptomeExpressionAnalysis
    is_a: WorkflowExecution
    description: >-
      A workflow process that provides expression values and read counts
      for gene features predicted on the contigs.
    in_subset: [workflow subset]
    slots:
      - img_identifiers
    slot_usage:
      # The id is mandatory and must contain the literal "wfmtex" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfmtex-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # At most one IMG identifier per record.
      img_identifiers:
        maximum_cardinality: 1
      # The referenced id must use the "omprc" or "dgns" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgns)-{id_shoulder}-{id_blade}$'


  # Binning workflow record: contig counts per category plus the resulting
  # list of bins (mags_list).
  MagsAnalysis:
    is_a: WorkflowExecution
    class_uri: nmdc:MagsAnalysis
    title: Metagenome-Assembled Genome analysis activity
    description: >-
      A workflow execution activity that uses computational binning tools
      to group assembled contigs into genomes
    in_subset: [workflow subset]
    slots:
      - binned_contig_num
      - input_contig_num
      - low_depth_contig_num
      - mags_list
      - too_short_contig_num
      - unbinned_contig_num
      - img_identifiers
    slot_usage:
      # The id is mandatory and must contain the literal "wfmag" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfmag-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # At most one IMG identifier per record.
      img_identifiers:
        maximum_cardinality: 1
      # The referenced id must use the "omprc" or "dgns" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgns)-{id_shoulder}-{id_blade}$'


  # Sequencing activity record; declares no slots of its own beyond what it
  # inherits, only slot_usage constraints.
  MetagenomeSequencing:
    is_a: WorkflowExecution
    class_uri: nmdc:MetagenomeSequencing
    title: Metagenome sequencing activity
    description: "Initial sequencing activity that precedes any analysis.  This activity has output(s) that are the raw sequencing data."
    in_subset: [workflow subset]
    slot_usage:
      # The id is mandatory and must contain the literal "wfmsa" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfmsa-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # Inputs are constrained to ids with the "bsm" or "procsm" segment.
      has_input:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(bsm|procsm)-{id_shoulder}-{id_blade}$'
      # The referenced id must use the "omprc" or "dgns" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgns)-{id_shoulder}-{id_blade}$'

  # Read QC workflow record with before/after read and base statistics.
  ReadQcAnalysis:
    is_a: WorkflowExecution
    class_uri: nmdc:ReadQcAnalysis
    title: Read quality control analysis activity
    description: >-
      A workflow execution activity that performs quality control on raw
      Illumina reads including quality trimming, artifact removal, linker
      trimming, adapter trimming, spike-in removal, and
      human/cat/dog/mouse/microbe contaminant removal
    in_subset: [workflow subset]
    slots:
      - input_base_count
      - input_read_bases
      - input_read_count
      - output_base_count
      - output_read_bases
      - output_read_count
    slot_usage:
      # The id is mandatory and must contain the literal "wfrqc" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfrqc-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # The referenced id must use the "omprc" or "dgns" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgns)-{id_shoulder}-{id_blade}$'

  # Read-based taxonomy workflow record; adds only slot_usage constraints.
  ReadBasedTaxonomyAnalysis:
    is_a: WorkflowExecution
    class_uri: nmdc:ReadBasedTaxonomyAnalysis
    title: Read based analysis activity
    description: >-
      A workflow execution activity that performs taxonomy classification
      using sequencing reads
    in_subset: [workflow subset]
    slot_usage:
      # The id is mandatory and must contain the literal "wfrbt" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfrbt-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # The referenced id must use the "omprc" or "dgns" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgns)-{id_shoulder}-{id_blade}$'

  # Metabolomics workflow record with calibration and metabolite
  # identification slots.
  MetabolomicsAnalysis:
    is_a: WorkflowExecution
    class_uri: nmdc:MetabolomicsAnalysis
    in_subset: [workflow subset]
    slots:
      - has_calibration
      - has_metabolite_identifications
    slot_usage:
      # The id is mandatory and must contain the literal "wfmb" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfmb-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # The referenced id must use the "omprc" or "dgms" segment
      # (dgms here, where the sequencing-based classes use dgns).
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgms)-{id_shoulder}-{id_blade}$'

  # Metaproteomics workflow record with peptide quantification results.
  MetaproteomicsAnalysis:
    is_a: WorkflowExecution
    class_uri: nmdc:MetaproteomicsAnalysis
    in_subset: [workflow subset]
    slots:
      - has_peptide_quantifications
    slot_usage:
      # The id is mandatory and must contain the literal "wfmp" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfmp-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # The referenced id must use the "omprc" or "dgms" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgms)-{id_shoulder}-{id_blade}$'

  # NOM analysis workflow record with a calibration slot.
  NomAnalysis:
    is_a: WorkflowExecution
    class_uri: nmdc:NomAnalysis
    in_subset: [workflow subset]
    slots:
      - has_calibration
    slot_usage:
      # The id is mandatory and must contain the literal "wfnom" segment.
      id:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:wfnom-{id_shoulder}-{id_blade}{id_version}$'
        required: true
      # The referenced id must use the "omprc" or "dgms" segment.
      was_informed_by:
        structured_pattern:
          interpolated: true
          syntax: '{id_nmdc_prefix}:(omprc|dgms)-{id_shoulder}-{id_blade}$'

  # Calibration record attached to a process. internal_calibration and
  # calibration_target are always required (slot_usage); the two rules below
  # add conditional requirements on calibration_standard and calibration_object.
  CalibrationInformation:
    class_uri: nmdc:CalibrationInformation
    is_a: InformationObject
    description: A calibration object that is associated with a process.
    slots:
      - calibration_object
      - internal_calibration
      - calibration_target
      - calibration_standard
    rules:
      - title: calibration_standard_if_rt
        description: >-
          If the calibration_target is retention_index, a calibration_standard is required.
        preconditions:
          slot_conditions:
            calibration_target:
              equals_string: retention_index
        postconditions:
          slot_conditions:
            calibration_standard:
              required: true
      - title: calibration_object_if_not_internal_calibration
        description: >-
          If internal_calibration is false, a calibration_object is required.
        preconditions:
          slot_conditions:
            # Each slot condition is a slot-expression mapping, mirroring the
            # calibration_target condition in the rule above. A bare scalar
            # (internal_calibration: false) carries no comparison, so the
            # "is false" test was never actually expressed.
            # NOTE(review): internal_calibration has range boolean and
            # equals_string compares against a string - confirm that the
            # validator/generator in use matches this against a boolean false.
            internal_calibration:
              equals_string: "false"
        postconditions:
          slot_conditions:
            calibration_object:
              required: true
    slot_usage:
      internal_calibration:
        required: true
      calibration_target:
        required: true
      # Identifier must contain the literal "calib" segment; brace placeholders
      # are interpolated.
      id:
        structured_pattern:
          syntax: "{id_nmdc_prefix}:calib-{id_shoulder}-{id_blade}$"
          interpolated: true

slots:

  # Abstract parent slot; the assembly statistic slots in this file point at
  # it through is_a.
  metagenome_assembly_parameter:
    abstract: true

  # List of inlined PeptideQuantification objects.
  has_peptide_quantifications:
    multivalued: true
    inlined_as_list: true
    range: PeptideQuantification

  asm_score:
    range: float
    is_a: metagenome_assembly_parameter
    description: A score for comparing metagenomic assembly quality from same sample.

  # Scaffold totals and contiguity scores. All are declared with range float,
  # including the count-like values.
  scaffolds:
    range: float
    is_a: metagenome_assembly_parameter
    description: Total sequence count of all scaffolds.

  scaf_logsum:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      The sum of the (length*log(length)) of all scaffolds,
      times some constant.  Increase the contiguity, the score will increase

  scaf_powsum:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      Powersum of all scaffolds is the same as logsum except that it uses
      the sum of (length*(length^P)) for some power P (default P=0.25).

  scaf_max:
    range: float
    is_a: metagenome_assembly_parameter
    description: Maximum scaffold length.

  scaf_bp:
    range: float
    is_a: metagenome_assembly_parameter
    description: Total size in bp of all scaffolds.

  # Scaffold N/L statistics. As described here, the N values are scaffold
  # counts and the L values are lengths.
  # NOTE(review): this looks like the reverse of the more common N50/L50
  # usage - confirm it matches the tool that produces these numbers.
  scaf_n50:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      Given a set of scaffolds, each with its own length, the N50 count is
      defined as the smallest number of scaffolds whose length sum makes up
      half of genome size.

  scaf_n90:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      Given a set of scaffolds, each with its own length, the N90 count is
      defined as the smallest number of scaffolds whose length sum makes up
      90% of genome size.

  scaf_l50:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      Given a set of scaffolds, the L50 is defined as the sequence length of
      the shortest scaffold at 50% of the total genome length.

  scaf_l90:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      The L90 statistic is less than or equal to the L50 statistic; it is the
      length for which the collection of all scaffolds of that length or
      longer contains at least 90% of the sum of the lengths of all scaffolds.

  # Statistics restricted to scaffolds greater than 50 KB: count, total size
  # and size percentage.
  scaf_n_gt50k:
    range: float
    is_a: metagenome_assembly_parameter
    description: Total sequence count of scaffolds greater than 50 KB.

  scaf_l_gt50k:
    range: float
    is_a: metagenome_assembly_parameter
    description: Total size in bp of all scaffolds greater than 50 KB.

  scaf_pct_gt50k:
    range: float
    is_a: metagenome_assembly_parameter
    description: Total sequence size percentage of scaffolds greater than 50 KB.

  # Contig counterpart of the `scaffolds` slot. The previous description was
  # the log-sum definition, which describes a contiguity score rather than a
  # contig count; it now parallels the wording used for `scaffolds`.
  contigs:
    is_a: metagenome_assembly_parameter
    description: >-
      Total sequence count of all contigs.
    range: float

  # Summed contig size in base pairs.
  contig_bp:
    range: float
    is_a: metagenome_assembly_parameter
    description: Total size in bp of all contigs.

  # Contig N50, described as a count of contigs. The description previously
  # read "number_of_contigs" with stray underscores; it now matches the
  # wording used by ctg_n90 and scaf_n50.
  ctg_n50:
    is_a: metagenome_assembly_parameter
    description: >-
      Given a set of contigs, each with its own length, the N50 count is defined as the smallest number of contigs whose length sum makes up half of genome size.
    range: float

  # Contig L50 / N90 / L90. As described here, N values are contig counts
  # and L values are lengths.
  ctg_l50:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      Given a set of contigs, the L50 is defined as the sequence length of
      the shortest contig at 50% of the total genome length.

  ctg_n90:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      Given a set of contigs, each with its own length, the N90 count is
      defined as the smallest number of contigs whose length sum makes up
      90% of genome size.

  ctg_l90:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      The L90 statistic is less than or equal to the L50 statistic; it is the
      length for which the collection of all contigs of that length or longer
      contains at least 90% of the sum of the lengths of all contigs.

  # Contig counterpart of scaf_logsum. The previous description ("Maximum
  # contig length.") duplicated ctg_max; it now gives the log-sum definition
  # in the same wording scaf_logsum uses for scaffolds.
  ctg_logsum:
    is_a: metagenome_assembly_parameter
    description: >-
      The sum of the (length*log(length)) of all contigs, times some constant.  Increase the contiguity, the score will increase
    range: float

  # Contig power-sum score and maximum contig length.
  ctg_powsum:
    range: float
    is_a: metagenome_assembly_parameter
    description: >-
      Powersum of all contigs is the same as logsum except that it uses
      the sum of (length*(length^P)) for some power P (default P=0.25).

  ctg_max:
    range: float
    is_a: metagenome_assembly_parameter
    description: Maximum contig length.

  # Gap percentage (over scaffolds) and GC content spread/mean (over contigs).
  gap_pct:
    range: float
    is_a: metagenome_assembly_parameter
    description: The gap size percentage of all scaffolds.

  gc_std:
    range: float
    is_a: metagenome_assembly_parameter
    description: Standard deviation of GC content of all contigs.

  gc_avg:
    range: float
    is_a: metagenome_assembly_parameter
    description: Average of GC content of all contigs.

  # Read counts going into assembly and aligning back to the contigs.
  num_input_reads:
    range: float
    is_a: metagenome_assembly_parameter
    description: The sequence count number of input reads for assembly.

  num_aligned_reads:
    range: float
    is_a: metagenome_assembly_parameter
    description: The sequence count number of input reads aligned to assembled contigs.

  # Abstract parent slot; the read/base count slots for QC analysis in this
  # file point at it through is_a.
  read_qc_analysis_statistic:
    abstract: true

  # List of inlined MagBin objects.
  mags_list:
    multivalued: true
    inlined_as_list: true
    range: MagBin

  # Integer contig counters. They carry no descriptions in this file; the
  # names suggest one counter per contig category - TODO confirm and describe.
  too_short_contig_num: {range: integer}

  binned_contig_num: {range: integer}

  input_contig_num: {range: integer}

  unbinned_contig_num: {range: integer}

  low_depth_contig_num: {range: integer}

  # Read and base counts before (input_*) and after (output_*) QC analysis.
  input_read_count:
    range: float
    is_a: read_qc_analysis_statistic
    description: The sequence count number of input reads for QC analysis.

  input_base_count:
    range: float
    is_a: read_qc_analysis_statistic
    description: The nucleotide base count number of input reads for QC analysis.

  output_read_count:
    range: float
    is_a: read_qc_analysis_statistic
    description: After QC analysis sequence count number.

  output_base_count:
    range: float
    is_a: read_qc_analysis_statistic
    description: After QC analysis nucleotide base count number.

  # The description is still the literal placeholder "TODO". Unlike the other
  # QC count slots, this one declares no is_a parent.
  output_read_bases:
    description: TODO
    range: float

  # The description is still the literal placeholder "TODO". Trailing spaces
  # after it were removed: inside a block scalar they become part of the value.
  input_read_bases:
    range: float
    description: >-
      TODO

  # Accepts either a CalibrationInformation object or a plain string; the
  # notes below record the planned narrowing of this range.
  # Trailing spaces were removed from the notes lines: in a folded block
  # scalar they are kept as content and produced stray double spaces.
  has_calibration:
    any_of:
      - range: CalibrationInformation
      - range: string
    description: a calibration instance associated with a process
    notes: >-
      has_calibration slot will be removed from all WorkflowExecution classes but remain on the
      MassSpectrometry class after an ingest of the appropriate set has occurred.
      Once this has occurred, this slot's range can be updated to CalibrationInformation and class/slot definitions can move to nmdc.yaml.
      See PR #29 in Berkeley schema.

  # Slots used by CalibrationInformation.
  calibration_object:
    description: the file containing calibration data object
    range: DataObject

  calibration_internal_placeholder_removed: null

  # List of inlined MetaboliteIdentification objects.
  has_metabolite_identifications:
    multivalued: true
    inlined_as_list: true
    range: MetaboliteIdentification

enums:
  # Measurements a calibration can target; range of the calibration_target slot.
  CalibrationTargetEnum:
    permissible_values:
      mass_charge_ratio:
        aliases: [Mass, m/z]
        title: m/z
      retention_time:
        aliases: [RT]
      retention_index:
        aliases: [RI]

  # Reference standards for calibration; range of the calibration_standard slot.
  CalibrationStandardEnum:
    permissible_values:
      fames:
        aliases: [FAMES]
      alkanes:
        aliases: [Alkanes]