From 7ba2aa108384c541ae0e779ca36a046617caa214 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 9 Dec 2024 22:33:24 +0000 Subject: [PATCH 1/6] v2.2.0 updates --- Dockerfile | 10 +++++----- docs/inputs/theiaprok.md | 2 +- docs/usage.md | 6 +++--- docs/versioning/exhaustive.md | 3 ++- tbp_parser/LIMS.py | 7 +++++++ tbp_parser/Laboratorian.py | 4 ++-- tbp_parser/Row.py | 5 ++++- tbp_parser/Variant.py | 26 +++++++++++++++++++------- tbp_parser/__init__.py | 2 +- tbp_parser/globals.py | 20 +++++++++++++++----- 10 files changed, 59 insertions(+), 26 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2d5e493..8a95eaf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,9 +3,9 @@ # Shelby Bennett, Erin Young, Curtis Kapsak, & Kutluhan Incekara ARG SAMTOOLS_VER="1.18" -ARG TBP_PARSER_VER="2.1.1" +ARG TBP_PARSER_VER="2.2.0" -FROM ubuntu:jammy as builder +FROM ubuntu:jammy AS builder ARG SAMTOOLS_VER @@ -34,7 +34,7 @@ RUN wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VER}/ make install ### start of app stage ### -FROM ubuntu:jammy as app +FROM ubuntu:jammy AS app ARG SAMTOOLS_VER ARG TBP_PARSER_VER @@ -42,7 +42,7 @@ ARG TBP_PARSER_VER LABEL base.image="ubuntu:jammy" LABEL dockerfile.version="1" LABEL software="tbp-parser" -LABEL software.version="2.1.1" +LABEL software.version="2.2.0" LABEL description="tbp-parser and samtools" LABEL website="https://github.com/theiagen/tbp-parser" LABEL license="https://github.com/theiagen/tbp-parser/blob/main/LICENSE" @@ -95,7 +95,7 @@ WORKDIR /data CMD [ "python3", "/tbp-parser/tbp_parser/tbp_parser.py", "--help"] ### start of test stage ### -FROM app as test +FROM app AS test # # install wget for downloading test files RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certificates diff --git a/docs/inputs/theiaprok.md b/docs/inputs/theiaprok.md index a64b0f8..0ebed59 100644 --- a/docs/inputs/theiaprok.md +++ b/docs/inputs/theiaprok.md @@ -29,7 +29,7 @@ The following optional inputs are also available for user modification on Terra: | `merlin_magic` | **tbp_parser_coverage_regions_bed** | File | A BED file containing the regions to calculate percent coverage for | [tbdb-modified-regions.md](https://github.com/theiagen/tbp-parser/blob/main/data/tbdb-modified-regions.bed) | | `merlin_magic` | **tbp_parser_coverage_threshold** | Int | The minimum percentage of a region that has depth above the threshold set by `min_depth` (used for a gene/locus to pass QC) | 100 | | `merlin_magic` | **tbp_parser_debug** | Boolean | Set to `false` to turn off debug mode for `tbp-parser` | `true` | -| `merlin_magic` | **tbp_parser_docker_image** | String | The Docker image to use when running `tbp-parser` | "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.1.1" | +| `merlin_magic` | **tbp_parser_docker_image** | String | The Docker image to use when running `tbp-parser` | "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0" | | `merlin_magic` | **tbp_parser_etha237_frequency** | Float | Minimum frequency for a mutation in ethA at protein position 237 to pass QC in `tbp-parser` | 0.1 | | `merlin_magic` | **tbp_parser_expert_rule_regions_bed** | File | A file that contains the regions where R mutations and expert rules are applied | | | `merlin_magic` | **tbp_parser_min_depth** | Int | Minimum depth for a variant to pass QC in tbp_parser | 10 | diff --git a/docs/usage.md b/docs/usage.md index f06f93c..0f214b6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -9,7 +9,7 @@ title: Getting Started We highly recommend using the following Docker iamge to run tbp-parser: ``` bash -docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.1.1 #(1)! +docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0 #(1)! ``` 1. We host our Docker images on the Google Artifact Registry so that they are always availble for usage. @@ -17,11 +17,11 @@ docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.1.1 #(1)! The entrypoint for this Docker image is the `tbp-parser` help message. To run this container *interactively*, you can use the following command: ``` bash -docker run -it --entrypoint=/bin/bash us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.1.1 +docker run -it --entrypoint=/bin/bash us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0 # Once inside the container interactively, you can run the tbp-parser tool python3 /tbp-parser/tbp_parser/tbp_parser.py -v -# v2.1.1 +# v2.2.0 ``` ### Locally with Python diff --git a/docs/versioning/exhaustive.md b/docs/versioning/exhaustive.md index aa2d1d5..93e98d6 100644 --- a/docs/versioning/exhaustive.md +++ b/docs/versioning/exhaustive.md @@ -54,7 +54,7 @@ The following is a list of every version of `tbp-parser` and a short summary of - v1.5.2 - a mistake; somehow exactly the same as 1.4.4.2?? (this release is also a mystery) - v1.5.3 - make additional language changes and fix an unusual edge case where the same mutation was identified; rename mmpR5 to Rv0678 again - v1.5.4 (same change in v1.4.2.1) - prevent overwriting “R” mutations with No Sequence -- v1.5.5 (same change in v1.4.4.3 - fix an issue where “Pending Retest” was not properly appearing; consider only LIMS genes for LIMS reort +- v1.5.5 (same change in v1.4.4.3) - fix an issue where “Pending Retest” was not properly appearing; consider only LIMS genes for LIMS reort - v1.5.6 (same change in v1.4.4.4) - prevent “Pending Retest” if Insufficient Coverage is in a gene that also has a valid deletion - v1.5.7 (same change in v1.4.4.9) - add optional input to add cycloserine to LIMS report - v1.5.8 (same change in v1.4.4.7) - change tNGS LIMS lineage designation to check items in the coverage dictionary (to represent both rpoB segments; percentage calculation erroneously combined them) @@ -65,6 +65,7 @@ The following is a list of every version of `tbp-parser` and a short summary of - v2.1.0 - any mutations in the 60 proximal promoter regions included in the WHO v2 database (Table 22, page 89-90). *Use either the smw-tbprofiler-updates-dev branch until the time of the v2.3.0 release of TheiaProk on Terra for this and all subsequent v2.1.x+ versions* - Earlier versions are now deprecated and will no longer be supported. - v2.1.1 - adds the source and comment fields from TBDB to the Laboratorian report; fixes a bug where mmpR5 was not being completly renamed to Rv0678; fixes a bug where mutations that didn't share the same position were being compared +- v2.2.0 - removes ciprofloxacin, fluoroquinolones, and ofloxacin from gyrA and gyrB in the `globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME` dictionary; if a drug is missing in the TBProfiler JSON's gene_associated_drug field that is present in that global dictionary, it will be added for the mutation. --- diff --git a/tbp_parser/LIMS.py b/tbp_parser/LIMS.py index beca40d..dcaac46 100644 --- a/tbp_parser/LIMS.py +++ b/tbp_parser/LIMS.py @@ -142,6 +142,7 @@ def apply_lims_rules(self, gene_dictionary, DF_LIMS, max_mdl_resistance, antimic mdl_interpretations = globals.DF_LABORATORIAN[(globals.DF_LABORATORIAN["tbprofiler_gene_name"] == gene) & (globals.DF_LABORATORIAN["antimicrobial"] == antimicrobial_name)]["mdl_interpretation"].tolist() warnings = globals.DF_LABORATORIAN[(globals.DF_LABORATORIAN["tbprofiler_gene_name"] == gene) & (globals.DF_LABORATORIAN["antimicrobial"] == antimicrobial_name)]["warning"].tolist() read_supports = globals.DF_LABORATORIAN[(globals.DF_LABORATORIAN["tbprofiler_gene_name"] == gene) & (globals.DF_LABORATORIAN["antimicrobial"] == antimicrobial_name)]["read_support"].tolist() + tbdb_comments = globals.DF_LABORATORIAN[(globals.DF_LABORATORIAN["tbprofiler_gene_name"] == gene) & (globals.DF_LABORATORIAN["antimicrobial"] == antimicrobial_name)]["tbdb_comment"].tolist() self.logger.debug("LIMS:The following mutations belong to this gene ({}) and are associated with this drug ({})".format(gene, antimicrobial_name)) self.logger.debug("LIMS:Amino acid mutations: {}".format(aa_mutations_per_gene)) @@ -268,6 +269,12 @@ def apply_lims_rules(self, gene_dictionary, DF_LIMS, max_mdl_resistance, antimic if mutation_type == "synonymous_variant": substitution = "{} [synonymous]".format(substitution) + # add a high-level or low-level resistance annotation to the LIMS report + for key, value in globals.COMMENT_TO_LIMS.items(): + if key in tbdb_comments[index]: + substitution = "{} [{}]".format(substitution, value) + break + # add the formatted mutation to mutations_per_gene dictionary (formatted as a list) if gene not in mutations_per_gene.keys(): mutations_per_gene[gene] = [substitution] diff --git a/tbp_parser/Laboratorian.py b/tbp_parser/Laboratorian.py index f080ebf..d3f9815 100644 --- a/tbp_parser/Laboratorian.py +++ b/tbp_parser/Laboratorian.py @@ -119,10 +119,10 @@ def create_laboratorian_report(self): for gene, antimicrobial_drug_names in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.items(): for drug_name in antimicrobial_drug_names: if gene not in globals.GENES_REPORTED: - self.logger.debug("LAB:Gene {} not in report, now adding it to the report".format(gene)) + self.logger.debug("LAB:Gene {} with antimicrobial {} not in report, now adding it to the report".format(gene, drug_name)) row_list.append(Row(self.logger, None, "NA", drug_name, gene)) else: - self.logger.debug("LAB:Gene {} already in report".format(gene)) + self.logger.debug("LAB:Gene {} with antimicrobial {} already in report".format(gene, drug_name)) # make a list to add QC fail rows to end of laboratorian report reorder_list = [] diff --git a/tbp_parser/Row.py b/tbp_parser/Row.py index c5e888a..cf8498e 100644 --- a/tbp_parser/Row.py +++ b/tbp_parser/Row.py @@ -283,7 +283,7 @@ def complete_row(self): self.logger.debug("ROW:Interpretation logic applied or skipped; now removing any 'noexpert' suffixes") self.describe_rationale() - + self.logger.debug("rationale = {}".format(self.rationale)) self.logger.debug("ROW:Finished completing the row's values, now exiting function") def rank_annotation(self): @@ -328,7 +328,10 @@ def describe_rationale(self): if any(rule in self.mdl_interpretation for rule in globals.RULE_TO_RATIONALE.keys()): interpretation = self.mdl_interpretation[0] rule = self.mdl_interpretation[1:] + self.logger.debug("rule={}".format(rule)) self.mdl_interpretation = interpretation + self.logger.debug(globals.RULE_TO_RATIONALE[rule]) + globals.RULE_TO_RATIONALE[rule] self.rationale = globals.RULE_TO_RATIONALE[rule] self.confidence = "No WHO annotation" \ No newline at end of file diff --git a/tbp_parser/Variant.py b/tbp_parser/Variant.py index ee5e972..064d339 100644 --- a/tbp_parser/Variant.py +++ b/tbp_parser/Variant.py @@ -111,8 +111,15 @@ def extract_annotations(self): for drug in gene_associated_drug_list: self.logger.debug("VAR:The drug ({}) was not found in the annotation dictionary; adding it with a WHO confidence of 'No WHO annotation'".format(drug)) self.annotation_dictionary[drug] = Row(self.logger, self, "No WHO annotation", drug) - + + if self.gene_name in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.keys(): + for antimicrobial in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME[self.gene_name]: + if antimicrobial not in self.annotation_dictionary.keys(): + self.logger.debug("VAR: The drug ({}) was not found in the gene associated drug list; adding it with a WHO confidence of 'No WHO annotation'".format(antimicrobial)) + self.annotation_dictionary[antimicrobial] = Row(self.logger, self, "No WHO annotation", antimicrobial) + self.logger.debug("VAR:The annotation dictionary has all gene associated drugs included; it now has a length of {}".format(len(self.annotation_dictionary))) + else: # possibilities 1b and 2: the annotation field has no content or the field does not exist self.logger.debug("VAR:The annotation field has no content or does not exist. Now iterating through gene associated drugs.") @@ -120,6 +127,11 @@ def extract_annotations(self): for drug in self.gene_associated_drugs: self.annotation_dictionary[drug] = Row(self.logger, self, "No WHO annotation", drug) + if self.gene_name in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.keys(): + for antimicrobial in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME[self.gene_name]: + if antimicrobial not in self.annotation_dictionary.keys(): + self.annotation_dictionary[antimicrobial] = Row(self.logger, self, "No WHO annotation", antimicrobial) + self.logger.debug("VAR:After iterating through gene_associated_drugs, the annotation dictionary has a length of {}".format(len(self.annotation_dictionary))) self.logger.debug("VAR:Annotations extracted, now exiting function") @@ -141,7 +153,7 @@ def apply_expert_rules(self, interpretation_destination): # check if position within promoter regions if self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" # otherwise, check if it is an upstream gene variant if "upstream_gene_variant" in self.type: @@ -168,7 +180,7 @@ def apply_expert_rules(self, interpretation_destination): # check if position within promoter regions if self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" else: self.logger.debug("VAR:The position is not within the special positions or the proximal promoter region; interpretation is 'S/U'") @@ -192,7 +204,7 @@ def apply_expert_rules(self, interpretation_destination): # check if position within promoter regions if self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" elif ("upstream_gene_variant" in self.type): self.logger.debug("VAR:The mutation type is a NONsynonymous variant, not in the proximal promoter region, and is an upstream gene variant; interpretation is 'S/U'") @@ -230,7 +242,7 @@ def apply_expert_rules(self, interpretation_destination): if self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" elif ("upstream_gene_variant" in self.type): self.logger.debug("VAR:The position is not within the special positions, not in the proximal promoter region, is nonsynomyous but is an upstream gene variant; interpretation is 'S/U'") @@ -255,7 +267,7 @@ def apply_expert_rules(self, interpretation_destination): elif self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" else: self.logger.debug("VAR:The position is not within the special positions, and not in the proximal promoter region,; interpretation is 'S/U'") @@ -267,7 +279,7 @@ def apply_expert_rules(self, interpretation_destination): return "Srule3.2.4" elif self.gene_name in globals.WHOV2_PROMOTER_REGIONS.keys() and globals.is_within_range(position_nt, globals.WHOV2_PROMOTER_REGIONS[self.gene_name]): self.logger.debug("VAR:The position is within the proximal promoter region; interpretation is 'U'") - return "Urule6" + return "Uwhov2" elif ("upstream_gene_variant" in self.type): self.logger.debug("VAR:The mutation is a nonsynonymous variant and not in the proximal promoter region, but is an upstream gene variant; interpretation is 'S/U'") return "Srule3.2.4" if interpretation_destination == "mdl" else "Urule3.2.4" diff --git a/tbp_parser/__init__.py b/tbp_parser/__init__.py index be0fcb3..424bd53 100644 --- a/tbp_parser/__init__.py +++ b/tbp_parser/__init__.py @@ -1 +1 @@ -__VERSION__ = "v2.1.1" \ No newline at end of file +__VERSION__ = "v2.2.0" \ No newline at end of file diff --git a/tbp_parser/globals.py b/tbp_parser/globals.py index 6955507..c097c26 100644 --- a/tbp_parser/globals.py +++ b/tbp_parser/globals.py @@ -282,6 +282,17 @@ def is_within_range(position, range_positions): "streptomycin": ["gid", "rpsL", "rrs"] } +""" +A dictionary that converts the mutation comment to a reportable LIMS values for the corresponding mutation +The key is the search term in the comment (comment.contains(key)) and the value will be output in square brackets +""" +global COMMENT_TO_LIMS +COMMENT_TO_LIMS = { + "Low-level resistance (multiple, genetically linked low-level resistance mutations are additive and confer high-level resistance)": "Low-level resistance", + "High-level resistance": "High-level resistance" +} + + """ A dictionary that will contain the percent coverage for each gene """ @@ -365,8 +376,8 @@ def is_within_range(position, range_positions): "folC": ["para-aminosalicylic_acid"], "fprA": ["amikacin", "capreomycin"], "gid": ["streptomycin"], - "gyrA": ["ciprofloxacin", "fluoroquinolones", "levofloxacin", "moxifloxacin", "ofloxacin"], - "gyrB": ["ciprofloxacin", "fluoroquinolones", "levofloxacin", "moxifloxacin", "ofloxacin"], + "gyrA": ["levofloxacin", "moxifloxacin"], + "gyrB": ["levofloxacin", "moxifloxacin"], "inhA": ["ethionamide", "isoniazid"], "kasA": ["isoniazid"], "katG": ["isoniazid"], @@ -550,8 +561,7 @@ def is_within_range(position, range_positions): """ global LINEAGE_ENGLISH LINEAGE_ENGLISH = "" - - + """ A list that will contain the names of genes that have coverages below the coverage threshold (see also COVERAGE_THRESHOLD) @@ -745,7 +755,7 @@ def is_within_range(position, range_positions): "rule3.2.2": "Expert rule 3.2.3. gyrA QRDR", "rule3.2.3": "Expert rule 3.2.3. gyrB QRDR", "rule3.2.4": "No WHO annotation or expert rule", - "rule6": "Mutation in proximal promoter region" + "whov2": "Mutation in proximal promoter region" } """ From 695d832529af975a2a4c84a2280a87a3464f3a28 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 10 Dec 2024 16:41:59 +0000 Subject: [PATCH 2/6] add source field and remove aminoglycosides from rrs --- docs/versioning/exhaustive.md | 2 +- tbp_parser/Variant.py | 2 +- tbp_parser/globals.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/versioning/exhaustive.md b/docs/versioning/exhaustive.md index 93e98d6..89f65cc 100644 --- a/docs/versioning/exhaustive.md +++ b/docs/versioning/exhaustive.md @@ -65,7 +65,7 @@ The following is a list of every version of `tbp-parser` and a short summary of - v2.1.0 - any mutations in the 60 proximal promoter regions included in the WHO v2 database (Table 22, page 89-90). *Use either the smw-tbprofiler-updates-dev branch until the time of the v2.3.0 release of TheiaProk on Terra for this and all subsequent v2.1.x+ versions* - Earlier versions are now deprecated and will no longer be supported. - v2.1.1 - adds the source and comment fields from TBDB to the Laboratorian report; fixes a bug where mmpR5 was not being completly renamed to Rv0678; fixes a bug where mutations that didn't share the same position were being compared -- v2.2.0 - removes ciprofloxacin, fluoroquinolones, and ofloxacin from gyrA and gyrB in the `globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME` dictionary; if a drug is missing in the TBProfiler JSON's gene_associated_drug field that is present in that global dictionary, it will be added for the mutation. +- v2.2.0 - removes ciprofloxacin, fluoroquinolones, and ofloxacin from gyrA and gyrB and aminoglycosides from rrs in the `globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME` dictionary; if a drug is missing in the TBProfiler JSON's gene_associated_drug field that is present in that global dictionary, it will be added for the mutation. --- diff --git a/tbp_parser/Variant.py b/tbp_parser/Variant.py index 064d339..6ff693d 100644 --- a/tbp_parser/Variant.py +++ b/tbp_parser/Variant.py @@ -116,7 +116,7 @@ def extract_annotations(self): for antimicrobial in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME[self.gene_name]: if antimicrobial not in self.annotation_dictionary.keys(): self.logger.debug("VAR: The drug ({}) was not found in the gene associated drug list; adding it with a WHO confidence of 'No WHO annotation'".format(antimicrobial)) - self.annotation_dictionary[antimicrobial] = Row(self.logger, self, "No WHO annotation", antimicrobial) + self.annotation_dictionary[antimicrobial] = Row(self.logger, self, "No WHO annotation", antimicrobial, source="Mutation effect for given drug is not in TBDB") self.logger.debug("VAR:The annotation dictionary has all gene associated drugs included; it now has a length of {}".format(len(self.annotation_dictionary))) diff --git a/tbp_parser/globals.py b/tbp_parser/globals.py index c097c26..316789a 100644 --- a/tbp_parser/globals.py +++ b/tbp_parser/globals.py @@ -397,7 +397,7 @@ def is_within_range(position, range_positions): "rpsA": ["pyrazinamide"], "rpsL": ["streptomycin"], "rrl": ["linezolid"], - "rrs": ["amikacin", "aminoglycosides", "capreomycin", "kanamycin", "linezolid"], + "rrs": ["amikacin", "capreomycin", "kanamycin", "linezolid"], "Rv0678": ["bedaquiline", "clofazimine"], "Rv1258c": ["isoniazid", "pyrazinamide", "streptomycin"], "Rv1979c": ["bedaquiline", "clofazimine"], From 61405652687d2e71bb74301e0399ecac3b6ca5c0 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 11 Dec 2024 18:25:29 +0000 Subject: [PATCH 3/6] prevent future warning --- tbp_parser/Coverage.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tbp_parser/Coverage.py b/tbp_parser/Coverage.py index 6b61427..6eb284d 100644 --- a/tbp_parser/Coverage.py +++ b/tbp_parser/Coverage.py @@ -140,8 +140,10 @@ def reformat_coverage(self): except: self.logger.error("An expected gene ({}) was not found in laboratorian report.\nSomething may have gone wrong.".format(gene)) - DF_COVERAGE = pd.concat([DF_COVERAGE, pd.DataFrame({"Gene": gene, "Percent_Coverage": percent_coverage, "Warning": warning}, index=[0])], ignore_index=True) - + if len(DF_COVERAGE) == 0: + DF_COVERAGE = pd.DataFrame({"Gene": gene, "Percent_Coverage": percent_coverage, "Warning": warning}, index=[0]) + else: + DF_COVERAGE = pd.concat([DF_COVERAGE, pd.DataFrame({"Gene": gene, "Percent_Coverage": percent_coverage, "Warning": warning}, index=[0])], ignore_index=True) if self.tngs: self.logger.debug("COV:Merging the tNGS expert rule regions coverage with the initial coverage report and renaming columns") From 081d2844e99a7ad6c22fb7ff759892291e2bf4fb Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 11 Dec 2024 18:26:20 +0000 Subject: [PATCH 4/6] make sure rifampicin is rifampin --- tbp_parser/LIMS.py | 2 +- tbp_parser/Laboratorian.py | 2 +- tbp_parser/Row.py | 4 +--- tbp_parser/Variant.py | 12 +++++------- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/tbp_parser/LIMS.py b/tbp_parser/LIMS.py index dcaac46..91c7a2d 100644 --- a/tbp_parser/LIMS.py +++ b/tbp_parser/LIMS.py @@ -281,7 +281,7 @@ def apply_lims_rules(self, gene_dictionary, DF_LIMS, max_mdl_resistance, antimic elif substitution not in mutations_per_gene[gene]: mutations_per_gene[gene] = "{}; {}".format("".join(mutations_per_gene[gene]), substitution) else: - self.logger.debug("LIMS:This mutation (\"{}\", origin gene: {}) is not being added to the LIMS report because it is a non-rpoB RRDR \"S\" mutation".format(mutation, gene)) + self.logger.debug("LIMS:This mutation (\"{}\", origin gene: {}) is not being added to the LIMS report because it is not an rpoB RRDR \"S\" mutation".format(mutation, gene)) # Mutations for a particular gene have been added to the mutations_per_gene dictionary. # if that gene has mutations associated with it, we want to perform some additional filtration, diff --git a/tbp_parser/Laboratorian.py b/tbp_parser/Laboratorian.py index d3f9815..326424c 100644 --- a/tbp_parser/Laboratorian.py +++ b/tbp_parser/Laboratorian.py @@ -56,7 +56,7 @@ def iterate_section(self, variant_section, row_list): # extract all of the annotations for the variant variant.extract_annotations() - self.logger.debug("LAB:The current variant (gene: {}) has {} annotations; now iterating through them".format(variant.gene_name, len(variant.annotation_dictionary))) + self.logger.debug("LAB:The current variant (gene: {}) has {} annotation(s); now iterating through them".format(variant.gene_name, len(variant.annotation_dictionary))) for annotation_row in variant.annotation_dictionary.values(): # complete the row objects annotation_row.complete_row() diff --git a/tbp_parser/Row.py b/tbp_parser/Row.py index cf8498e..3455c58 100644 --- a/tbp_parser/Row.py +++ b/tbp_parser/Row.py @@ -51,7 +51,7 @@ def __init__(self, logger, variant, who_confidence, drug, gene_name=None, depth= self.tbprofiler_variant_substitution_type = self.variant.type self.tbprofiler_variant_substitution_nt = self.variant.nucleotide_change self.tbprofiler_variant_substitution_aa = self.variant.protein_change - self.logger.debug("ROW:This mutations is a {} with {} and {}".format(self.tbprofiler_variant_substitution_type, self.tbprofiler_variant_substitution_nt, self.tbprofiler_variant_substitution_aa)) + self.logger.debug("ROW:This mutation is a {} with nucleotide change \"{}\" and protein change \"{}\"".format(self.tbprofiler_variant_substitution_type, self.tbprofiler_variant_substitution_nt, self.tbprofiler_variant_substitution_aa)) # change blank aa substitutions to NA if self.tbprofiler_variant_substitution_aa == "": self.tbprofiler_variant_substitution_aa = "NA" @@ -283,7 +283,6 @@ def complete_row(self): self.logger.debug("ROW:Interpretation logic applied or skipped; now removing any 'noexpert' suffixes") self.describe_rationale() - self.logger.debug("rationale = {}".format(self.rationale)) self.logger.debug("ROW:Finished completing the row's values, now exiting function") def rank_annotation(self): @@ -328,7 +327,6 @@ def describe_rationale(self): if any(rule in self.mdl_interpretation for rule in globals.RULE_TO_RATIONALE.keys()): interpretation = self.mdl_interpretation[0] rule = self.mdl_interpretation[1:] - self.logger.debug("rule={}".format(rule)) self.mdl_interpretation = interpretation self.logger.debug(globals.RULE_TO_RATIONALE[rule]) globals.RULE_TO_RATIONALE[rule] diff --git a/tbp_parser/Variant.py b/tbp_parser/Variant.py index 6ff693d..8908388 100644 --- a/tbp_parser/Variant.py +++ b/tbp_parser/Variant.py @@ -122,9 +122,11 @@ def extract_annotations(self): else: # possibilities 1b and 2: the annotation field has no content or the field does not exist - self.logger.debug("VAR:The annotation field has no content or does not exist. Now iterating through gene associated drugs.") + self.logger.debug("VAR:The annotation field has no content or does not exist. Now iterating through gene associated drugs and gene-drug combination dictionary.") for drug in self.gene_associated_drugs: + if drug == "rifampicin": + drug = "rifampin" self.annotation_dictionary[drug] = Row(self.logger, self, "No WHO annotation", drug) if self.gene_name in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.keys(): @@ -132,15 +134,13 @@ def extract_annotations(self): if antimicrobial not in self.annotation_dictionary.keys(): self.annotation_dictionary[antimicrobial] = Row(self.logger, self, "No WHO annotation", antimicrobial) - self.logger.debug("VAR:After iterating through gene_associated_drugs, the annotation dictionary has a length of {}".format(len(self.annotation_dictionary))) - - self.logger.debug("VAR:Annotations extracted, now exiting function") + self.logger.debug("VAR:{} annotation(s) extracted, now exiting function".format(len(self.annotation_dictionary))) def apply_expert_rules(self, interpretation_destination): """ Apply rules 1-3 from the CDPH interpretation logic document regarding the interpretation of potential resistance mutations. """ - self.logger.debug("VAR:Within the Variant class apply_expert_rules function") + self.logger.debug("VAR:Within the Variant class apply_expert_rules function for {}".format(interpretation_destination)) position_nt = globals.get_position(self.nucleotide_change) position_aa = globals.get_position(self.protein_change) @@ -216,8 +216,6 @@ def apply_expert_rules(self, interpretation_destination): # rules 2.2.2.1 and 3.2.2 & 3.2.3 elif self.gene_name in ["gyrA", "gyrB", "rpoB"]: self.logger.debug("VAR:The gene is {}, now checking if the position requires special consideration".format(self.gene_name)) - self.logger.debug("VAR:length of aa: {}".format(len(position_aa))) - self.logger.debug("VAR:SEPCIAL POSITIONS: {}".format(globals.SPECIAL_POSITIONS[self.gene_name])) if globals.is_within_range(position_aa, globals.SPECIAL_POSITIONS[self.gene_name]): self.logger.debug("VAR:The position is within the special positions; interpretation is 'R' if rpoB (or 'U' if not) and nonsynonymous, else 'S'") From 195ed3556c00b2a0963830683929650daa54ab3f Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 11 Dec 2024 18:29:01 +0000 Subject: [PATCH 5/6] update documentation and versions --- Dockerfile | 4 ++-- docs/inputs/theiaprok.md | 2 +- docs/usage.md | 4 ++-- docs/versioning/exhaustive.md | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8a95eaf..bf6da7c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # Shelby Bennett, Erin Young, Curtis Kapsak, & Kutluhan Incekara ARG SAMTOOLS_VER="1.18" -ARG TBP_PARSER_VER="2.2.0" +ARG TBP_PARSER_VER="2.2.1" FROM ubuntu:jammy AS builder @@ -42,7 +42,7 @@ ARG TBP_PARSER_VER LABEL base.image="ubuntu:jammy" LABEL dockerfile.version="1" LABEL software="tbp-parser" -LABEL software.version="2.2.0" +LABEL software.version="2.2.1" LABEL description="tbp-parser and samtools" LABEL website="https://github.com/theiagen/tbp-parser" LABEL license="https://github.com/theiagen/tbp-parser/blob/main/LICENSE" diff --git a/docs/inputs/theiaprok.md b/docs/inputs/theiaprok.md index 0ebed59..71aefa6 100644 --- a/docs/inputs/theiaprok.md +++ b/docs/inputs/theiaprok.md @@ -29,7 +29,7 @@ The following optional inputs are also available for user modification on Terra: | `merlin_magic` | **tbp_parser_coverage_regions_bed** | File | A BED file containing the regions to calculate percent coverage for | [tbdb-modified-regions.md](https://github.com/theiagen/tbp-parser/blob/main/data/tbdb-modified-regions.bed) | | `merlin_magic` | **tbp_parser_coverage_threshold** | Int | The minimum percentage of a region that has depth above the threshold set by `min_depth` (used for a gene/locus to pass QC) | 100 | | `merlin_magic` | **tbp_parser_debug** | Boolean | Set to `false` to turn off debug mode for `tbp-parser` | `true` | -| `merlin_magic` | **tbp_parser_docker_image** | String | The Docker image to use when running `tbp-parser` | "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0" | +| `merlin_magic` | **tbp_parser_docker_image** | String | The Docker image to use when running `tbp-parser` | "us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.1" | | `merlin_magic` | **tbp_parser_etha237_frequency** | Float | Minimum frequency for a mutation in ethA at protein position 237 to pass QC in `tbp-parser` | 0.1 | | `merlin_magic` | **tbp_parser_expert_rule_regions_bed** | File | A file that contains the regions where R mutations and expert rules are applied | | | `merlin_magic` | **tbp_parser_min_depth** | Int | Minimum depth for a variant to pass QC in tbp_parser | 10 | diff --git a/docs/usage.md b/docs/usage.md index 0f214b6..47404f6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -9,7 +9,7 @@ title: Getting Started We highly recommend using the following Docker iamge to run tbp-parser: ``` bash -docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0 #(1)! +docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.1 #(1)! ``` 1. We host our Docker images on the Google Artifact Registry so that they are always availble for usage. @@ -17,7 +17,7 @@ docker pull us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0 #(1)! The entrypoint for this Docker image is the `tbp-parser` help message. To run this container *interactively*, you can use the following command: ``` bash -docker run -it --entrypoint=/bin/bash us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.0 +docker run -it --entrypoint=/bin/bash us-docker.pkg.dev/general-theiagen/theiagen/tbp-parser:2.2.1 # Once inside the container interactively, you can run the tbp-parser tool python3 /tbp-parser/tbp_parser/tbp_parser.py -v diff --git a/docs/versioning/exhaustive.md b/docs/versioning/exhaustive.md index 89f65cc..02c5f4a 100644 --- a/docs/versioning/exhaustive.md +++ b/docs/versioning/exhaustive.md @@ -66,7 +66,8 @@ The following is a list of every version of `tbp-parser` and a short summary of - Earlier versions are now deprecated and will no longer be supported. - v2.1.1 - adds the source and comment fields from TBDB to the Laboratorian report; fixes a bug where mmpR5 was not being completly renamed to Rv0678; fixes a bug where mutations that didn't share the same position were being compared - v2.2.0 - removes ciprofloxacin, fluoroquinolones, and ofloxacin from gyrA and gyrB and aminoglycosides from rrs in the `globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME` dictionary; if a drug is missing in the TBProfiler JSON's gene_associated_drug field that is present in that global dictionary, it will be added for the mutation. - +- v2.2.1 - fixes a bug where rifampicin was not being renamed to rifampin and resulted in duplicate rows in the laboratorian report +- --- The following diagram shows how each version is related to the others without technical details: From cdcf302ffa8f2a10db71afc289e3e968bfebd0f0 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 11 Dec 2024 18:33:43 +0000 Subject: [PATCH 6/6] remove duplicated thing --- tbp_parser/Variant.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tbp_parser/Variant.py b/tbp_parser/Variant.py index b568704..b362daf 100644 --- a/tbp_parser/Variant.py +++ b/tbp_parser/Variant.py @@ -112,13 +112,6 @@ def extract_annotations(self): self.logger.debug("VAR:The drug ({}) was not found in the annotation dictionary; adding it with a WHO confidence of 'No WHO annotation'".format(drug)) self.annotation_dictionary[drug] = Row(self.logger, self, "No WHO annotation", drug) - if self.gene_name in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.keys(): - for antimicrobial in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME[self.gene_name]: - if antimicrobial not in self.annotation_dictionary.keys(): - self.logger.debug("VAR: The drug ({}) was not found in the gene associated drug list; adding it with a WHO confidence of 'No WHO annotation'".format(antimicrobial)) - self.annotation_dictionary[antimicrobial] = Row(self.logger, self, "No WHO annotation", antimicrobial, source="Mutation effect for given drug is not in TBDB") - - if self.gene_name in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME.keys(): for antimicrobial in globals.GENE_TO_ANTIMICROBIAL_DRUG_NAME[self.gene_name]: if antimicrobial not in self.annotation_dictionary.keys():