From b4a87e2bc68c8ea6385aa071762d82caf3a77f30 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 10 Dec 2024 12:05:34 -0500 Subject: [PATCH] v2.2.0 changes (#17) * v2.2.0 updates * add source field and remove aminoglycosides from rrs --- Dockerfile | 10 +++++----- docs/inputs/theiaprok.md | 2 +- docs/usage.md | 6 +++--- docs/versioning/exhaustive.md | 3 ++- tbp_parser/LIMS.py | 7 +++++++ tbp_parser/Laboratorian.py | 4 ++-- tbp_parser/Row.py | 5 ++++- tbp_parser/Variant.py | 26 +++++++++++++++++++------- tbp_parser/__init__.py | 2 +- tbp_parser/globals.py | 22 ++++++++++++++++------ 10 files changed, 60 insertions(+), 27 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2d5e493..8a95eaf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,9 +3,9 @@ # Shelby Bennett, Erin Young, Curtis Kapsak, & Kutluhan Incekara ARG SAMTOOLS_VER="1.18" -ARG TBP_PARSER_VER="2.1.1" +ARG TBP_PARSER_VER="2.2.0" -FROM ubuntu:jammy as builder +FROM ubuntu:jammy AS builder ARG SAMTOOLS_VER @@ -34,7 +34,7 @@ RUN wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VER}/ make install ### start of app stage ### -FROM ubuntu:jammy as app +FROM ubuntu:jammy AS app ARG SAMTOOLS_VER ARG TBP_PARSER_VER @@ -42,7 +42,7 @@ ARG TBP_PARSER_VER LABEL base.image="ubuntu:jammy" LABEL dockerfile.version="1" LABEL software="tbp-parser" -LABEL software.version="2.1.1" +LABEL software.version="2.2.0" LABEL description="tbp-parser and samtools" LABEL website="https://github.com/theiagen/tbp-parser" LABEL license="https://github.com/theiagen/tbp-parser/blob/main/LICENSE" @@ -95,7 +95,7 @@ WORKDIR /data CMD [ "python3", "/tbp-parser/tbp_parser/tbp_parser.py", "--help"] ### start of test stage ### -FROM app as test +FROM app AS test # # install wget for downloading test files RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certificates diff --git a/docs/inputs/theiaprok.md b/docs/inputs/theiaprok.md index a64b0f8..0ebed59 100644 --- a/docs/inputs/theiaprok.md +++ b/docs/inputs/theiaprok.md @@ -29,7 +29,7 @@ The following optional inputs are also available for user modification on Terra: | `merlin_magic` | **tbp_parser_coverage_regions_bed** | File | A BED file containing the regions to calculate percent coverage for | [tbdb-modified-regions.md](https://github.com/theiagen/tbp-parser/blob/main/data/tbdb-modified-regions.bed) | | `merlin_magic` | **tbp_parser_coverage_threshold** | Int | The minimum percentage of a region that has depth above the threshold set by `min_depth` (used for a gene/locus to pass QC) | 100 | | `merlin_magic` | **tbp_parser_debug** | Boolean | Set to `false` to turn off debug mode for `tbp-parser` | `true` | -| `merlin_magic` | **tbp_parser_docker_image** | String | The Docker image to use when running `tbp-parser` | "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.1.1" | +| `merlin_magic` | **tbp_parser_docker_image** | String | The Docker image to use when running `tbp-parser` | "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0" | | `merlin_magic` | **tbp_parser_etha237_frequency** | Float | Minimum frequency for a mutation in ethA at protein position 237 to pass QC in `tbp-parser` | 0.1 | | `merlin_magic` | **tbp_parser_expert_rule_regions_bed** | File | A file that contains the regions where R mutations and expert rules are applied | | | `merlin_magic` | **tbp_parser_min_depth** | Int | Minimum depth for a variant to pass QC in tbp_parser | 10 | diff --git a/docs/usage.md b/docs/usage.md index f06f93c..0f214b6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -9,7 +9,7 @@ title: Getting Started We highly recommend using the following Docker iamge to run tbp-parser: ``` bash -docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.1.1 #(1)! +docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0 #(1)! ``` 1. We host our Docker images on the Google Artifact Registry so that they are always availble for usage. @@ -17,11 +17,11 @@ docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.1.1 #(1)! The entrypoint for this Docker image is the `tbp-parser` help message. To run this container *interactively*, you can use the following command: ``` bash -docker run -it --entrypoint=/bin/bash us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.1.1 +docker run -it --entrypoint=/bin/bash us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0 # Once inside the container interactively, you can run the tbp-parser tool python3 /tbp-parser/tbp_parser/tbp_parser.py -v -# v2.1.1 +# v2.2.0 ``` ### Locally with Python diff --git a/docs/versioning/exhaustive.md b/docs/versioning/exhaustive.md index aa2d1d5..89f65cc 100644 --- a/docs/versioning/exhaustive.md +++ b/docs/versioning/exhaustive.md @@ -54,7 +54,7 @@ The following is a list of every version of `tbp-parser` and a short summary of - v1.5.2 - a mistake; somehow exactly the same as 1.4.4.2?? (this release is also a mystery) - v1.5.3 - make additional language changes and fix an unusual edge case where the same mutation was identified; rename mmpR5 to Rv0678 again - v1.5.4 (same change in v1.4.2.1) - prevent overwriting “R” mutations with No Sequence -- v1.5.5 (same change in v1.4.4.3 - fix an issue where “Pending Retest” was not properly appearing; consider only LIMS genes for LIMS reort +- v1.5.5 (same change in v1.4.4.3) - fix an issue where “Pending Retest” was not properly appearing; consider only LIMS genes for LIMS reort - v1.5.6 (same change in v1.4.4.4) - prevent “Pending Retest” if Insufficient Coverage is in a gene that also has a valid deletion - v1.5.7 (same change in v1.4.4.9) - add optional input to add cycloserine to LIMS report - v1.5.8 (same change in v1.4.4.7) - change tNGS LIMS lineage designation to check items in the coverage dictionary (to represent both rpoB segments; percentage calculation erroneously combined them) @@ -65,6 +65,7 @@ The following is a list of every version of `tbp-parser` and a short summary of - v2.1.0 - any mutations in the 60 proximal promoter regions included in the WHO v2 database (Table 22, page 89-90). *Use either the smw-tbprofiler-updates-dev branch until the time of the v2.3.0 release of TheiaProk on Terra for this and all subsequent v2.1.x+ versions* - Earlier versions are now deprecated and will no longer be supported. - v2.1.1 - adds the source and comment fields from TBDB to the Laboratorian report; fixes a bug where mmpR5 was not being completly renamed to Rv0678; fixes a bug where mutations that didn't share the same position were being compared +- v2.2.0 - removes ciprofloxacin, fluoroquinolones, and ofloxacin from gyrA and gyrB and aminoglycosides from rrs in the `globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME` dictionary; if a drug is missing in the TBProfiler JSON's gene_associated_drug field that is present in that global dictionary, it will be added for the mutation. --- diff --git a/tbp_parser/LIMS.py b/tbp_parser/LIMS.py index beca40d..dcaac46 100644 --- a/tbp_parser/LIMS.py +++ b/tbp_parser/LIMS.py @@ -142,6 +142,7 @@ def apply_lims_rules(self, gene_dictionary, DF_LIMS, max_mdl_resistance, antimic mdl_interpretations = globals.DF_LABORATORIAN[(globals.DF_LABORATORIAN["tbprofiler_gene_name"] == gene) & (globals.DF_LABORATORIAN["antimicrobial"] == antimicrobial_name)]["mdl_interpretation"].tolist() warnings = globals.DF_LABORATORIAN[(globals.DF_LABORATORIAN["tbprofiler_gene_name"] == gene) & (globals.DF_LABORATORIAN["antimicrobial"] == antimicrobial_name)]["warning"].tolist() read_supports = globals.DF_LABORATORIAN[(globals.DF_LABORATORIAN["tbprofiler_gene_name"] == gene) & (globals.DF_LABORATORIAN["antimicrobial"] == antimicrobial_name)]["read_support"].tolist() + tbdb_comments = globals.DF_LABORATORIAN[(globals.DF_LABORATORIAN["tbprofiler_gene_name"] == gene) & (globals.DF_LABORATORIAN["antimicrobial"] == antimicrobial_name)]["tbdb_comment"].tolist() self.logger.debug("LIMS:The following mutations belong to this gene ({}) and are associated with this drug ({})".format(gene, antimicrobial_name)) self.logger.debug("LIMS:Amino acid mutations: {}".format(aa_mutations_per_gene)) @@ -268,6 +269,12 @@ def apply_lims_rules(self, gene_dictionary, DF_LIMS, max_mdl_resistance, antimic if mutation_type == "synonymous_variant": substitution = "{} [synonymous]".format(substitution) + # add a high-level or low-level resistance annotation to the LIMS report + for key, value in globals.COMMENT_TO_LIMS.items(): + if key in tbdb_comments[index]: + substitution = "{} [{}]".format(substitution, value) + break + # add the formatted mutation to mutations_per_gene dictionary (formatted as a list) if gene not in mutations_per_gene.keys(): mutations_per_gene[gene] = [substitution] diff --git a/tbp_parser/Laboratorian.py b/tbp_parser/Laboratorian.py index f080ebf..d3f9815 100644 --- a/tbp_parser/Laboratorian.py +++ b/tbp_parser/Laboratorian.py @@ -119,10 +119,10 @@ def create_laboratorian_report(self): for gene, antimicrobial_drug_names in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.items(): for drug_name in antimicrobial_drug_names: if gene not in globals.GENES_REPORTED: - self.logger.debug("LAB:Gene {} not in report, now adding it to the report".format(gene)) + self.logger.debug("LAB:Gene {} with antimicrobial {} not in report, now adding it to the report".format(gene, drug_name)) row_list.append(Row(self.logger, None, "NA", drug_name, gene)) else: - self.logger.debug("LAB:Gene {} already in report".format(gene)) + self.logger.debug("LAB:Gene {} with antimicrobial {} already in report".format(gene, drug_name)) # make a list to add QC fail rows to end of laboratorian report reorder_list = [] diff --git a/tbp_parser/Row.py b/tbp_parser/Row.py index c5e888a..cf8498e 100644 --- a/tbp_parser/Row.py +++ b/tbp_parser/Row.py @@ -283,7 +283,7 @@ def complete_row(self): self.logger.debug("ROW:Interpretation logic applied or skipped; now removing any 'noexpert' suffixes") self.describe_rationale() - + self.logger.debug("rationale = {}".format(self.rationale)) self.logger.debug("ROW:Finished completing the row's values, now exiting function") def rank_annotation(self): @@ -328,7 +328,10 @@ def describe_rationale(self): if any(rule in self.mdl_interpretation for rule in globals.RULE_TO_RATIONALE.keys()): interpretation = self.mdl_interpretation[0] rule = self.mdl_interpretation[1:] + self.logger.debug("rule={}".format(rule)) self.mdl_interpretation = interpretation + self.logger.debug(globals.RULE_TO_RATIONALE[rule]) + globals.RULE_TO_RATIONALE[rule] self.rationale = globals.RULE_TO_RATIONALE[rule] self.confidence = "No WHO annotation" \ No newline at end of file diff --git a/tbp_parser/Variant.py b/tbp_parser/Variant.py index ee5e972..6ff693d 100644 --- a/tbp_parser/Variant.py +++ b/tbp_parser/Variant.py @@ -111,8 +111,15 @@ def extract_annotations(self): for drug in gene_associated_drug_list: self.logger.debug("VAR:The drug ({}) was not found in the annotation dictionary; adding it with a WHO confidence of 'No WHO annotation'".format(drug)) self.annotation_dictionary[drug] = Row(self.logger, self, "No WHO annotation", drug) - + + if self.gene_name in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.keys(): + for antimicrobial in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME[self.gene_name]: + if antimicrobial not in self.annotation_dictionary.keys(): + self.logger.debug("VAR: The drug ({}) was not found in the gene associated drug list; adding it with a WHO confidence of 'No WHO annotation'".format(antimicrobial)) + self.annotation_dictionary[antimicrobial] = Row(self.logger, self, "No WHO annotation", antimicrobial, source="Mutation effect for given drug is not in TBDB") + self.logger.debug("VAR:The annotation dictionary has all gene associated drugs included; it now has a length of {}".format(len(self.annotation_dictionary))) + else: # possibilities 1b and 2: the annotation field has no content or the field does not exist self.logger.debug("VAR:The annotation field has no content or does not exist. Now iterating through gene associated drugs.") @@ -120,6 +127,11 @@ def extract_annotations(self): for drug in self.gene_associated_drugs: self.annotation_dictionary[drug] = Row(self.logger, self, "No WHO annotation", drug) + if self.gene_name in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.keys(): + for antimicrobial in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME[self.gene_name]: + if antimicrobial not in self.annotation_dictionary.keys(): + self.annotation_dictionary[antimicrobial] = Row(self.logger, self, "No WHO annotation", antimicrobial) + self.logger.debug("VAR:After iterating through gene_associated_drugs, the annotation dictionary has a length of {}".format(len(self.annotation_dictionary))) self.logger.debug("VAR:Annotations extracted, now exiting function") @@ -141,7 +153,7 @@ def apply_expert_rules(self, interpretation_destination): # check if position within promoter regions if self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" # otherwise, check if it is an upstream gene variant if "upstream_gene_variant" in self.type: @@ -168,7 +180,7 @@ def apply_expert_rules(self, interpretation_destination): # check if position within promoter regions if self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" else: self.logger.debug("VAR:The position is not within the special positions or the proximal promoter region; interpretation is 'S/U'") @@ -192,7 +204,7 @@ def apply_expert_rules(self, interpretation_destination): # check if position within promoter regions if self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" elif ("upstream_gene_variant" in self.type): self.logger.debug("VAR:The mutation type is a NONsynonymous variant, not in the proximal promoter region, and is an upstream gene variant; interpretation is 'S/U'") @@ -230,7 +242,7 @@ def apply_expert_rules(self, interpretation_destination): if self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" elif ("upstream_gene_variant" in self.type): self.logger.debug("VAR:The position is not within the special positions, not in the proximal promoter region, is nonsynomyous but is an upstream gene variant; interpretation is 'S/U'") @@ -255,7 +267,7 @@ def apply_expert_rules(self, interpretation_destination): elif self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" else: self.logger.debug("VAR:The position is not within the special positions, and not in the proximal promoter region,; interpretation is 'S/U'") @@ -267,7 +279,7 @@ def apply_expert_rules(self, interpretation_destination): return "Srule3.2.4" elif self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" elif ("upstream_gene_variant" in self.type): self.logger.debug("VAR:The mutation is a nonsynonymous variant and not in the proximal promoter region, but is an upstream gene variant; interpretation is 'S/U'") return "Srule3.2.4" if interpretation_destination == "mdl" else "Urule3.2.4" diff --git a/tbp_parser/__init__.py b/tbp_parser/__init__.py index be0fcb3..424bd53 100644 --- a/tbp_parser/__init__.py +++ b/tbp_parser/__init__.py @@ -1 +1 @@ -__VERSION__ = "v2.1.1" \ No newline at end of file +__VERSION__ = "v2.2.0" \ No newline at end of file diff --git a/tbp_parser/globals.py b/tbp_parser/globals.py index 6955507..316789a 100644 --- a/tbp_parser/globals.py +++ b/tbp_parser/globals.py @@ -282,6 +282,17 @@ def is_within_range(position, range_positions): "streptomycin": ["gid", "rpsL", "rrs"] } +""" +A dictionary that converts the mutation comment to a reportable LIMS values for the corresponding mutation +The key is the search term in the comment (comment.contains(key)) and the value will be output in square brackets +""" +global COMMENT_TO_LIMS +COMMENT_TO_LIMS = { + "Low-level resistance (multiple, genetically linked low-level resistance mutations are additive and confer high-level resistance)": "Low-level resistance", + "High-level resistance": "High-level resistance" +} + + """ A dictionary that will contain the percent coverage for each gene """ @@ -365,8 +376,8 @@ def is_within_range(position, range_positions): "folC": ["para-aminosalicylic_acid"], "fprA": ["amikacin", "capreomycin"], "gid": ["streptomycin"], - "gyrA": ["ciprofloxacin", "fluoroquinolones", "levofloxacin", "moxifloxacin", "ofloxacin"], - "gyrB": ["ciprofloxacin", "fluoroquinolones", "levofloxacin", "moxifloxacin", "ofloxacin"], + "gyrA": ["levofloxacin", "moxifloxacin"], + "gyrB": ["levofloxacin", "moxifloxacin"], "inhA": ["ethionamide", "isoniazid"], "kasA": ["isoniazid"], "katG": ["isoniazid"], @@ -386,7 +397,7 @@ def is_within_range(position, range_positions): "rpsA": ["pyrazinamide"], "rpsL": ["streptomycin"], "rrl": ["linezolid"], - "rrs": ["amikacin", "aminoglycosides", "capreomycin", "kanamycin", "linezolid"], + "rrs": ["amikacin", "capreomycin", "kanamycin", "linezolid"], "Rv0678": ["bedaquiline", "clofazimine"], "Rv1258c": ["isoniazid", "pyrazinamide", "streptomycin"], "Rv1979c": ["bedaquiline", "clofazimine"], @@ -550,8 +561,7 @@ def is_within_range(position, range_positions): """ global LINEAGE_ENGLISH LINEAGE_ENGLISH = "" - - + """ A list that will contain the names of genes that have coverages below the coverage threshold (see also COVERAGE_THRESHOLD) @@ -745,7 +755,7 @@ def is_within_range(position, range_positions): "rule3.2.2": "Expert rule 3.2.3. gyrA QRDR", "rule3.2.3": "Expert rule 3.2.3. gyrB QRDR", "rule3.2.4": "No WHO annotation or expert rule", - "rule6": "Mutation in proximal promoter region" + "whov2": "Mutation in proximal promoter region" } """