Commit e4dd601

Merge pull request #116 from CDCgov/v2.0.2

V2.0.2

nvlachos authored Aug 3, 2023
2 parents 6784d46 + aa021d3 commit e4dd601
Showing 59 changed files with 654 additions and 179 deletions.
30 changes: 29 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,37 @@ Below is the list of changes to phx since its initial release. As fixes can take
[Full Changelog](https://github.com/CDCgov/phoenix/compare/v2.0.0...v2.0.1)

**Implemented Enhancements:**
- Updated nextflow tower scheme that describes inputs.

**Fixed Bugs:**
- Typo fix and changed branch called in Terra task that caused Terra version to crash.

## [v2.0.2](https://github.com/CDCgov/phoenix/releases/tag/v2.0.2) (08/03/2023)

[Full Changelog](https://github.com/CDCgov/phoenix/compare/v2.0.1...v2.0.2)

**Implemented Enhancements:**
- Added handling for -entry `SCAFFOLDS` and `CDC_SCAFFOLDS` to accept assemblies from Trycycler and Flye.
- Added tsv version of GRiPHin_Summary.xlsx

**Output File Changes:**
- GRiPHin_samplesheet.csv changed to Directory_samplesheet.csv
- In response to feedback from the compliance program, "report" is being replaced by "summary" in file names to avoid confusion between public health results (i.e. summary) and diagnostic results (i.e. report).
- GRiPHin_Report.xlsx changed to GRiPHin_Summary.xlsx
- Phoenix_Output_Report.tsv changed to Phoenix_Summary.tsv
- quast/${samplename}_report.txt changed to quast/${samplename}_summary.tsv
- kraken2_trimd/${samplename}.trimd_summary.txt changed to kraken2_trimd/${samplename}.kraken2_trimd.top_kraken_hit.txt
- kraken2_asmbld/${samplename}.asmbld_summary.txt changed to kraken2_asmbld/${samplename}.kraken2_asmbld.top_kraken_hit.txt
- kraken2_asmbld_weighted/${samplename}.wtasmbld_summary.txt changed to kraken2_asmbld_weighted/${samplename}.kraken2_wtasmbld.top_kraken_hit.txt
- kraken2_trimd/${samplename}.kraken2_trimd.report.txt changed to kraken2_trimd/${samplename}.kraken2_trimd.summary.txt
- kraken2_asmbld/${samplename}.kraken2_asmbld.report.txt changed to kraken2_asmbld/${samplename}.kraken2_asmbld.summary.txt
- kraken2_asmbld_weighted/${samplename}.kraken2_wtasmbld.report.txt changed to kraken2_asmbld_weighted/${samplename}.kraken2_wtasmbld.summary.txt
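Downstream scripts that reference the old fixed-name outputs can be migrated with a small lookup. A minimal sketch (the helper and its name are illustrative, not part of PHoeNIx; the mapping is transcribed from the list above, and per-sample files like the quast and kraken2 outputs would need pattern-based handling instead):

```python
# Hypothetical migration helper: maps the fixed-name outputs renamed in
# v2.0.2, transcribed from the rename list above.
OLD_TO_NEW = {
    "GRiPHin_samplesheet.csv": "Directory_samplesheet.csv",
    "GRiPHin_Report.xlsx": "GRiPHin_Summary.xlsx",
    "Phoenix_Output_Report.tsv": "Phoenix_Summary.tsv",
}

def migrate_name(path):
    """Return the v2.0.2 name for a known pre-v2.0.2 output file, else the path unchanged."""
    head, _, tail = path.rpartition("/")
    new_tail = OLD_TO_NEW.get(tail, tail)
    return head + "/" + new_tail if head else new_tail
```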

**Fixed Bugs:**
- For MLST, when final alleles were assigned, PHX called a 100% match even though one allele was not a match.
- The MLST step was not using the custom database; a custom MLST container was added with this database included.
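The allele-matching bug was an indexing offset: in an MLST profile table the sequence type sits in column 0 and the alleles start at column 1, so a comparison loop over sample alleles that begins at index 1 silently skips the first locus (the corresponding fix in `bin/fix_MLST2.py` appears later in this diff). A simplified illustration, not the pipeline's exact code:

```python
def profile_matches(db_row, sample_alleles):
    """db_row: [ST, allele_1, ..., allele_n]; sample_alleles: [allele_1, ..., allele_n].
    True only if every locus matches (note the offset of 1 between the two lists)."""
    return all(db_row[i + 1] == sample_alleles[i] for i in range(len(sample_alleles)))

db_row = ["12", "4", "1", "7"]  # ST 12 with three loci
assert profile_matches(db_row, ["4", "1", "7"])
# Starting the loop at index 1 (the old bug) ignores the first locus entirely:
buggy = all(db_row[i + 1] == ["9", "1", "7"][i] for i in range(1, 3))
assert buggy  # reports a match even though locus 1 differs
```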

**Container Updates:**
- MLST version remains the same, but a custom database was added so that it no longer uses the database included in the software. Now hosted on quay.io.
- Bumped up base container (v2.0.2) to have openpyxl module.

10 changes: 9 additions & 1 deletion Dockerfiles/Dockerfile_Terra
Expand Up @@ -17,6 +17,8 @@ RUN micromamba create -n phoenix -c defaults -c bioconda -c conda-forge \
conda-forge::biopython \
conda-forge::rsync \
conda-forge::xlsxwriter \
anaconda::pandas=1.3.5 \
anaconda::openpyxl \
conda-forge::bc \
conda-forge::wget \
conda-forge::ca-certificates \
Expand Down Expand Up @@ -59,4 +61,10 @@ ENV PATH=/opt/conda/envs/phoenix/bin:/opt/conda/envs/amrfinderplus/bin:/opt/cond
/opt/conda/bin:/opt/conda/envs/env/bin:/opt/conda/envs/env/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

#setting up stuff for BUSCO
ENV AUGUSTUS_CONFIG_PATH=/opt/conda/envs/busco/config/

#remove db and add custom mlst db
RUN rm /opt/conda/envs/phoenix/db/scheme_species_map.tab && rm -r /opt/conda/envs/phoenix/db/pubmlst && rm -r /opt/conda/envs/phoenix/db/blast
COPY --chown=mambauser:mambauser db /opt/conda/envs/phoenix/db/
#change permissions of database files
RUN chmod 755 --recursive /opt/conda/envs/phoenix/db/*
3 changes: 2 additions & 1 deletion Dockerfiles/Dockerfile_base
Expand Up @@ -68,4 +68,5 @@ RUN pip3 install biopython \
times \
xlsxwriter \
cryptography==36.0.2 \
pytest-shutil \
openpyxl
57 changes: 57 additions & 0 deletions Dockerfiles/Dockerfile_mlst
@@ -0,0 +1,57 @@
FROM ubuntu:focal as app

ARG MLST_VER="2.23.0"
ARG ANY2FASTA_VER="0.4.2"

LABEL base.image="ubuntu:focal"
LABEL dockerfile.version="1"
LABEL software="mlst"
LABEL software.version="${MLST_VER}"
LABEL description="Scan contig files against PubMLST typing schemes"
LABEL website="https://github.com/tseemann/mlst"
LABEL license="https://github.com/tseemann/mlst/blob/master/LICENSE"
LABEL maintainer="Jill Hagey"
LABEL maintainer.email="[email protected]"

# install dependencies via apt; cleanup apt garbage
# blast from ubuntu:focal is v2.9.0 (as of 2022-05-09)
RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
ca-certificates \
libmoo-perl \
liblist-moreutils-perl \
libjson-perl \
gzip \
file \
ncbi-blast+ && \
apt-get autoclean && rm -rf /var/lib/apt/lists/*

# get any2fasta; move binary to /usr/local/bin which is already in $PATH
RUN wget https://github.com/tseemann/any2fasta/archive/refs/tags/v${ANY2FASTA_VER}.tar.gz && \
tar xzf v${ANY2FASTA_VER}.tar.gz && \
rm v${ANY2FASTA_VER}.tar.gz && \
chmod +x any2fasta-${ANY2FASTA_VER}/any2fasta && \
mv -v any2fasta-${ANY2FASTA_VER}/any2fasta /usr/local/bin

# get mlst
RUN wget https://github.com/tseemann/mlst/archive/v${MLST_VER}.tar.gz && \
tar -xzf v${MLST_VER}.tar.gz && \
rm v${MLST_VER}.tar.gz

# set PATH and perl local settings
ENV PATH="${PATH}:/mlst-${MLST_VER}/bin" \
LC_ALL=C.UTF-8

# check dependencies and list available schemes
RUN mlst --check && mlst --list

# set working directory so it's easier to find the db
WORKDIR /data

#remove db that comes with the mlst software
RUN rm -r /mlst-${MLST_VER}/db/*
# add in custom db and move to correct location
COPY db /data/db
RUN mv /data/db ../mlst-${MLST_VER}
#change permissions of database files
RUN chmod 755 --recursive /mlst-${MLST_VER}/db/*
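For reference, `mlst` expects the copied-in `db` directory to follow its standard on-disk layout, roughly as below (scheme and locus names are examples; the mlst README is the authoritative source — note this matches the three paths the Terra Dockerfile removes before copying in the custom database):

```text
db/
├── scheme_species_map.tab      # maps schemes to genus/species
├── pubmlst/
│   └── abaumannii/             # one directory per typing scheme
│       ├── abaumannii.txt      # ST profile table (ST + allele columns)
│       └── Oxf_gltA.tfa        # one FASTA file per locus
└── blast/
    └── mlst.fa                 # BLAST database built from all alleles
```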
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -2,7 +2,7 @@

<!-- [![GitHub Downloads](https://img.shields.io/github/downloads/CDCgov/phoenix/total.svg?style=social&logo=github&label=Download)](https://github.com/CDCgov/phoenix/releases) -->

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8148569.svg)](https://doi.org/10.5281/zenodo.8148569)
[![DOI](https://zenodo.org/badge/490844937.svg)](https://zenodo.org/badge/latestdoi/490844937)

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
Expand Down
Binary file added assets/Example-Microbe.1.0_Example_Data.xlsx
Binary file not shown.
Binary file added assets/SRA_metadata_2023-003_VIM_CRPA.xlsx
Binary file not shown.
Binary file not shown.
23 changes: 23 additions & 0 deletions assets/osii-bioprojects.yaml
@@ -0,0 +1,23 @@
HAISeq: PRJNA531911
Gram_positive:
Actinobacteria(P):
Non_tuberculous mycobacterium:
Mycobacterium(G): PRJNA542112
Mycobacteroides(G): PRJNA542112
Mycolicibacterium(G): PRJNA542112
Tuberculous mycobacterium:
Mycobacterium tuberculosis(G): blank
Non_mycobacterium:
Cutibacterium(G): blank
Firmicutes(P):
Bacillus(G): PRJNA590648
Clostridioides(G) difficile(s): PRJNA629351
Staphylococcus(G) non-aureus species(s): PRJNA591020
Staphylococcus(G) aureus(s): PRJNA533550
Enterococcus(G): PRJNA573902
Others: blank
Gram_negative:
Proteobacteria(P): PRJNA288601
Pseudomonadota(P): PRJNA288601
Bacteroidetes(P): PRJNA288601
Others: blank
53 changes: 35 additions & 18 deletions bin/GRiPHin.py
Expand Up @@ -26,7 +26,7 @@ def parseArgs(args=None):
parser.add_argument('-d', '--directory', default=None, required=False, dest='directory', help='If a directory is given rather than samplesheet GRiPHin will create one for all samples in the directory.')
parser.add_argument('-c', '--control_list', required=False, dest='control_list', help='CSV file with a list of sample_name,new_name. This option will output the new_name rather than the sample name to "blind" reports.')
parser.add_argument('-a', '--ar_db', default=None, required=True, dest='ar_db', help='AR Gene Database file that is used to confirm srst2 gene names are the same as GAMMAs output.')
parser.add_argument('-o', '--output', default="", required=False, dest='output', help='Name of output file default is GRiPHin_Report.xlsx.')
parser.add_argument('-o', '--output', default="", required=False, dest='output', help='Name of output file default is GRiPHin_Summary.xlsx.')
parser.add_argument('--coverage', default=30, required=False, dest='set_coverage', help='The coverage cut off default is 30x.')
parser.add_argument('--scaffolds', dest="scaffolds", default=False, action='store_true', help='Turn on with --scaffolds to keep samples from failing/warnings/alerts that are based on trimmed data. Default is off.')
parser.add_argument('--phoenix', dest="phoenix", default=False, action='store_true', required=False, help='Use for -entry PHOENIX rather than CDC_PHOENIX, which is the default.')
Expand Down Expand Up @@ -272,7 +272,7 @@ def compile_warnings(scaffolds_entry, Total_Trimmed_reads, Q30_R1_per, Q30_R2_pe
warnings.append(">{:.2f}% unclassified scaffolds".format(int(30)))
if float(Asmbld_Genus_percent) <float(50.00):
warnings.append("<50% of weighted scaffolds assigned to top genera hit ({:.2f}%)".format(float(Asmbld_Genus_percent)))
else:
elif scaffolds == "Unknown" and Wt_asmbld_unclassified_percent == "Unknown" and Asmbld_Genus_percent == "Unknown":
warnings.append("No assembly file found possible SPAdes failure.")
if len(kraken_wtasmbld_genus) >=2:
warnings.append(">=2 genera had >{:.2f}% of wt scaffolds assigned to them".format(int(25)))
Expand Down Expand Up @@ -321,7 +321,7 @@ def parse_kraken_report(kraken_trim_report, kraken_wtasmbld_report, sample_name)
if genus_percent >= 25.00:
kraken_trim_genus.append(thing.replace(' ',''))
except FileNotFoundError:
print("Warning: " + sample_name + ".kraken2_trimd.report.txt not found")
print("Warning: " + sample_name + ".kraken2_trimd.summary.txt not found")
kraken_trim_genus = 'Unknown'
try:
#file is VERY WEIRD so we need some extra arguments
Expand All @@ -343,7 +343,7 @@ def parse_kraken_report(kraken_trim_report, kraken_wtasmbld_report, sample_name)
if genus_percent >= 25.00:
kraken_wtasmbld_genus.append(thing.replace(' ',''))
except FileNotFoundError:
print("Warning: " + sample_name + ".kraken2_wtasmbld.report.txt not found")
print("Warning: " + sample_name + ".kraken2_wtasmbld.summary.txt not found")
kraken_wtasmbld_report = 'Unknown'
return kraken_trim_genus, kraken_wtasmbld_genus
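The genus-threshold check this parser feeds (flagging genera that claim at least 25% of reads or weighted scaffolds) can be sketched independently, assuming the standard six-column kraken2 report layout; this is an illustration, not GRiPHin's exact parser:

```python
def genera_over_threshold(report_text, threshold=25.0):
    """Collect genus names at or above the percent threshold from a kraken2-style
    report (columns: percent, clade_reads, direct_reads, rank_code, taxid, name)."""
    genera = []
    for line in report_text.splitlines():
        fields = line.split("\t")
        if len(fields) < 6:
            continue
        percent, rank, name = float(fields[0]), fields[3], fields[5].strip()
        if rank == "G" and percent >= threshold:
            genera.append(name)
    return genera

report = (" 60.00\t600\t0\tG\t561\t  Escherichia\n"
          " 30.00\t300\t0\tG\t570\t  Klebsiella\n"
          "  5.00\t50\t0\tG\t1279\t  Staphylococcus\n")
assert genera_over_threshold(report) == ["Escherichia", "Klebsiella"]
```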

Expand Down Expand Up @@ -648,12 +648,12 @@ def Get_Metrics(phoenix_entry, scaffolds_entry, set_coverage, srst2_ar_df, pf_df
else:
Coverage = Assembly_Length = 'Unknown'
except FileNotFoundError:
print("Warning: " + sample_name + "_report.tsv not found")
print("Warning: " + sample_name + "_summary.tsv not found")
Coverage = Assembly_Length = 'Unknown'
try:
Scaffold_Count = get_scaffold_count(quast_report)
except FileNotFoundError:
print("Warning: " + sample_name + "_report.tsv not found")
print("Warning: " + sample_name + "_summary.tsv not found")
Scaffold_Count = 'Unknown'
try:
busco_metrics = Get_BUSCO_Gene_Count(busco_short_summary)
Expand Down Expand Up @@ -783,11 +783,11 @@ def Get_Files(directory, sample_name):
# create file names
trim_stats = directory + "/qc_stats/" + sample_name + "_trimmed_read_counts.txt"
raw_stats = directory + "/raw_stats/" + sample_name + "_raw_read_counts.txt"
kraken_trim = directory + "/kraken2_trimd/" + sample_name + ".trimd_summary.txt"
kraken_trim_report = directory + "/kraken2_trimd/" + sample_name + ".kraken2_trimd.report.txt"
kraken_wtasmbld = directory + "/kraken2_asmbld_weighted/" + sample_name + ".wtasmbld_summary.txt"
kraken_wtasmbld_report = directory + "/kraken2_asmbld_weighted/" + sample_name + ".kraken2_wtasmbld.report.txt"
quast_report = directory + "/quast/" + sample_name + "_report.tsv"
kraken_trim = directory + "/kraken2_trimd/" + sample_name + ".kraken2_trimd.top_kraken_hit.txt"
kraken_trim_report = directory + "/kraken2_trimd/" + sample_name + ".kraken2_trimd.summary.txt"
kraken_wtasmbld = directory + "/kraken2_asmbld_weighted/" + sample_name + ".kraken2_wtasmbld.top_kraken_hit.txt"
kraken_wtasmbld_report = directory + "/kraken2_asmbld_weighted/" + sample_name + ".kraken2_wtasmbld.summary.txt"
quast_report = directory + "/quast/" + sample_name + "_summary.tsv"
mlst_file = directory + "/mlst/" + sample_name + "_combined.tsv"
# This creates blank files when no file exists. Variables will be set to "Unknown" in the Get_Metrics function. This only needs to be done for files found via glob
# because glob will throw an index error otherwise.
Expand Down Expand Up @@ -1053,9 +1053,9 @@ def Combine_dfs(df, ar_df, pf_df, hv_df, srst2_ar_df, phoenix):
def write_to_excel(set_coverage, output, df, qc_max_col, ar_gene_count, pf_gene_count, hv_gene_count, columns_to_highlight, ar_df, pf_db, ar_db, hv_db, phoenix):
# Create a Pandas Excel writer using XlsxWriter as the engine.
if output != "":
writer = pd.ExcelWriter((output + '_GRiPHin_Report.xlsx'), engine='xlsxwriter')
writer = pd.ExcelWriter((output + '_GRiPHin_Summary.xlsx'), engine='xlsxwriter')
else:
writer = pd.ExcelWriter(('GRiPHin_Report.xlsx'), engine='xlsxwriter')
writer = pd.ExcelWriter(('GRiPHin_Summary.xlsx'), engine='xlsxwriter')
# Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name='Sheet1', index=False, startrow=1)
# Get the xlsxwriter workbook and worksheet objects for formatting
Expand Down Expand Up @@ -1186,13 +1186,13 @@ def blind_samples(final_df, control_file):
def create_samplesheet(directory):
"""Function will create a samplesheet from samples in a directory if -d argument passed."""
directory = os.path.abspath(directory) # make sure we have an absolute path to start with
with open("GRiPHin_samplesheet.csv", "w") as samplesheet:
with open("Directory_samplesheet.csv", "w") as samplesheet:
samplesheet.write('sample,directory\n')
dirs = sorted(os.listdir(directory))
# If there are any new files added to the top directory they will need to be added here or you will get an error
skip_list_a = glob.glob(directory + "/*_GRiPHin_Report.xlsx") # for if griphin is run on a folder that already has a report in it
skip_list_a = glob.glob(directory + "/*_GRiPHin_Summary.*") # for if griphin is run on a folder that already has a report in it
skip_list_a = [ gene.split('/')[-1] for gene in skip_list_a ] # just get the excel name not the full path
skip_list_b = ["Phoenix_Output_Report.tsv", "pipeline_info", "GRiPHin_Report.xlsx", "multiqc", "samplesheet_converted.csv", "GRiPHin_samplesheet.csv", "sra_samplesheet.csv"]
skip_list_b = ["BiosampleAttributes_Microbe.1.0.xlsx", "Sra_Microbe.1.0.xlsx", "Phoenix_Summary.tsv", "pipeline_info", "GRiPHin_Summary.xlsx", "multiqc", "samplesheet_converted.csv", "Directory_samplesheet.csv", "sra_samplesheet.csv"]
skip_list = skip_list_a + skip_list_b
#remove unwanted files
dirs_cleaned = [item for item in dirs if item not in skip_list]
Expand All @@ -1201,11 +1201,11 @@ def create_samplesheet(directory):
except: #if no numbers then use only alphabetically
dirs_sorted=sorted(dirs_cleaned)
for sample in dirs_sorted:
with open("GRiPHin_samplesheet.csv", "a") as samplesheet:
with open("Directory_samplesheet.csv", "a") as samplesheet:
if directory[-1] != "/": # if directory doesn't have trailing / add one
directory = directory + "/"
samplesheet.write(sample + "," + directory + sample + '\n')
samplesheet = "GRiPHin_samplesheet.csv"
samplesheet = "Directory_samplesheet.csv"
return samplesheet
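The try/except around the sort above falls back from numeric to plain alphabetical ordering; the numeric path is a natural sort, which can be written self-containedly as below (illustrative, not the pipeline's exact implementation):

```python
import re

def natural_key(name):
    """Split a name into text and integer chunks so 'sample2' sorts before 'sample10'."""
    return [int(chunk) if chunk.isdigit() else chunk.lower()
            for chunk in re.split(r"(\d+)", name)]

samples = ["sample10", "sample2", "sample1"]
assert sorted(samples, key=natural_key) == ["sample1", "sample2", "sample10"]
```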

def sort_samplesheet(samplesheet):
Expand All @@ -1220,6 +1220,21 @@ def sort_samplesheet(samplesheet):
df = df.loc[samples_sorted]
df.to_csv(samplesheet, sep=',', encoding='utf-8') #overwrite file

def convert_excel_to_tsv(output):
'''Reads in the xlsx file that was just created, outputs as tsv version with first layer of headers removed'''
if output != "":
output_file = output + '_GRiPHin_Summary'
else:
output_file = 'GRiPHin_Summary'
#Read excel file into a dataframe
data_xlsx = pd.read_excel(output_file + '.xlsx', 'Sheet1', index_col=None, header=[1])
#Replace all fields having line breaks with space
#data_xlsx = data_xlsx.replace('\n', ' ',regex=True)
#drop the footer information
data_xlsx = data_xlsx.iloc[:-10]
#Write dataframe into csv
data_xlsx.to_csv(output_file + '.tsv', sep='\t', encoding='utf-8', index=False, line_terminator='\n')

def main():
args = parseArgs()
# create empty lists to append to later
Expand Down Expand Up @@ -1272,6 +1287,8 @@ def main():
else:
final_df = final_df
write_to_excel(args.set_coverage, args.output, final_df, qc_max_col, ar_max_col, pf_max_col, hv_max_col, columns_to_highlight, final_ar_df, pf_db, ar_db, hv_db, args.phoenix)
convert_excel_to_tsv(args.output)


if __name__ == '__main__':
main()
15 changes: 12 additions & 3 deletions bin/fix_MLST2.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ def do_MLST_check(input_MLST_line_tuples, taxonomy_file, mlst_db_path):
db_alleles = db_line.split("\t")
match = False
db_type = db_alleles[0]
for i in range(1,len(current_alleles)):
for i in range(0,len(current_alleles)):
#print(i, len(db_alleles), len(current_alleles))
if db_alleles[i+1] == current_alleles[i]:
match = True
Expand Down Expand Up @@ -463,11 +463,16 @@ def do_MLST_check(input_MLST_line_tuples, taxonomy_file, mlst_db_path):
# Mark allele as found and complete
secondary_alleles.append('Complete')
if primary_alleles == secondary_alleles:
new_source="unset"
if primary_source == "assembly":
if secondary_source == "assembly":
print("Weird, shouldn't have both novel allele sets in one database come from assembly MLST")
# Not as weird, contamination testing this comes up, so a good way to catch?
new_source="assembly"
elif secondary_source == "assembly/reads":
print("Weird, shouldn't have both novel allele sets in one database come from assembly MLST (plus a confirmation srst2)")
# Not as weird, contamination testing this comes up, so a good way to catch?
new_source="assembly/reads"
elif secondary_source == "reads":
new_source = "assembly/reads"
else:
Expand All @@ -478,8 +483,12 @@ def do_MLST_check(input_MLST_line_tuples, taxonomy_file, mlst_db_path):
new_source = "assembly/reads"
elif secondary_source == "assembly/reads":
print("Weird, shouldn't have both novel allele sets in one database come from srst2 MLST (plus a confirmation assembly MLST)")
# Not as weird, contamination testing this comes up, so a good way to catch?
new_source="assembly/reads"
elif secondary_source == "reads":
print("Weird, shouldn't have both novel allele sets in one database come from srst2")
# Not as weird, contamination testing this comes up, so a good way to catch?
new_source="reads"
else:
print("Weird, don't know what the secondary source is")
secondary_is_srst2=True
Expand Down Expand Up @@ -514,7 +523,7 @@ def do_MLST_check(input_MLST_line_tuples, taxonomy_file, mlst_db_path):
outfile=isolate_name+"_combined.tsv"
print(outfile)
with open(outfile,'w') as writer:
writer.write("Sample\tSource\tPulled on\tDatabase\tST\tlocus_1\tlocus_2\tlocus_3\tlocus_4\tlocus_5\tlocus_6\tlocus_7\tlocus_8\tlocus_9\tlocus_10\n")
writer.write("Sample\tSource\tPulled_on\tDatabase\tST\tlocus_1\tlocus_2\tlocus_3\tlocus_4\tlocus_5\tlocus_6\tlocus_7\tlocus_8\tlocus_9\tlocus_10\n")
if len(checked_and_deduped_schemes) == 0:
print("No schemes found")
writer.write(isolate_name+"\tNone-"+genus+" "+species+"\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t")
Expand Down