Merge pull request #103 from CDCgov/v1.1.1

v1.1.1
CDCgov · Mar 22, 2023 · f5b1ca8 · f5b1ca8
2 parents ff99131 + a619538
commit f5b1ca8
Show file tree

Hide file tree

Showing 17 changed files with 1,475 additions and 779 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -52,3 +52,19 @@ Below are the list of changes to phx since is initial release. As fixes can take
 - BBTools updated from 38.96 to [39.01](https://sourceforge.net/projects/bbmap/).  
 - AMRFinder+ was updated from 3.10.40 to [3.10.45](https://github.com/ncbi/amr/releases/tag/amrfinder_v3.10.45).  
 - Scripts that utilize the phoenix_base container were updated to `quay.io/jvhagey/phoenix:base_v1.1.0` which had the python library `xlsxwriter` added to it for [`GRiPHin.py`](https://github.com/CDCgov/phoenix/blob/v1.0.1/bin/GRiPHin.py).  
+
+## [v1.1.1](https://github.com/CDCgov/phoenix/releases/tag/v1.1.1) (03/21/2023)
+
+[Full Changelog](https://github.com/CDCgov/phoenix/compare/v1.1.0...v1.1.1)
+
+**Implemented Enhancements:**
+- `-entry CDC_PHOENIX` workflow checks all FASTQ files for corruption and creates a list of the checked files using the FAIry (FASTQ file Assessment of Integrity) tool [commit 1111df8](https://github.com/CDCgov/phoenix/commit/651aafe6a9459e5471ce4e4efc164587170fee62). This is a required internal QC check.  
+- Expanded MLST lookup of *Citrobacter* species complex; [commit 43ea24d](https://github.com/CDCgov/phoenix/commit/43ea24d0206946eb9fc90e8303fc46353e6b719b) lists the new species.  
+- Increased SPAdes CPUs to 8 and memory to 16GB in `base.config`.  
+
+**Fixed Bugs:**  
+- Fix for issue [#99](https://github.com/CDCgov/phoenix/issues/99) where first gene in ar, plasmid, and hypervirulence genes didn't end up in the `*_summaryline.tsv`. This same error was in `Phoenix_summary_line.py` that caused the first sample to not be included in the final report.  
+- Fixed tabulation error in `*_combined.tsv` output files that in some cases would show in `GRiPHin_Report.xlsx` output as a single long line for the MLST type.  
+- Fix for issue [#91](https://github.com/CDCgov/phoenix/issues/91) where Klebsiella MLST lookup would not properly match to the correct lookup database.  
+- Fixed problem where samples that didn't create scaffolds, but created contigs didn't have species printed out in `Phoenix_Output_Report.tsv` details in [commit c7f7ea5](https://github.com/CDCgov/phoenix/commit/c7f7ea5bd42a0e2010e0b15e4b4f7e9119d394a2).  
+- Fixed problem in `-entry CDC_PHOENIX` where samples that didn't create scaffolds, but created contigs or samples that failed spades completely didn't have correct columns lining up in `Phoenix_Output_Report.tsv` details in [commit d17bdda](https://github.com/CDCgov/phoenix/commit/d17bdda89cf4d89aebe02a53082e5bb72c33582f).  
diff --git a/bin/Create_phoenix_summary_tsv.py b/bin/Create_phoenix_summary_tsv.py
@@ -30,7 +30,7 @@ def List_TSV(output_file, input_list, busco):
             input_list_sorted=sorted(input_list)
         for entry in input_list_sorted:
             with open(entry, "r") as f2:
-                header = next(f2) # skip the first line of the samplesheet
+                next(f2) # skip the first line of the samplesheet
                 for line in f2:
                     f.write(line + '\n')
 

diff --git a/bin/Phoenix_summary_line.py b/bin/Phoenix_summary_line.py
@@ -27,6 +27,7 @@ def parseArgs(args=None):
     parser.add_argument('-v', '--vir', required=False, help='hypervirulence GAMMA file')
     parser.add_argument('-k', '--kraken_trim', dest="trimd_kraken", required=False, help='trimd_summary.txt from kraken2')
     parser.add_argument('-s', '--stats', dest="stats", required=False, help='Pipeline Stats file synopsis file')
+    parser.add_argument('-e', '--extended_qc', dest="extended_qc", default=False, action='store_true', help='Pass to make true for -entry cdc pipelines') # Need this for when you call -entry CDC_PHOENIX or CDC_SCAFFOLDS, but spades fails
     parser.add_argument('-o', '--out', required=True, help='output file name')
     return parser.parse_args()
 
@@ -177,18 +178,6 @@ def Assembly_Ratio_Length(ratio_file):
     Out = int(Length)
     return Out
 
-def Assembly_Ratio_Species(ratio_file):
-    f = open(ratio_file, 'r')
-    String1 = f.readline()
-    Species = 'Unknown'
-    while String1 != '':
-        if ('Tax:' in String1):
-            Species = String1.split()[1:]
-            Species = ' '.join(Species)
-        String1 = f.readline()
-    f.close()
-    return Species
-
 def Trimmed_BP(trimmed_counts_file):
     f = open(trimmed_counts_file, 'r')
     String1 = f.readline()
@@ -205,9 +194,8 @@ def Trim_Coverage(trimmed_counts_file, ratio_file):
 
 def Bla_Genes(input_gamma):
     with open(input_gamma, 'r') as f:
-        header=next(f) # just use to skip first line
+        next(f) # just use to skip first line
         Bla = []
-        String1 = f.readline()
         for line in f:
             Cat = line.split('\t')[0].split('__')[4] # Drug category
             Gene = line.split('\t')[0].split('__')[2] # Gene Name
@@ -224,9 +212,8 @@ def Bla_Genes(input_gamma):
 
 def Non_Bla_Genes(input_gamma):
     with open(input_gamma, 'r') as f:
-        header=next(f) # just use to skip first line
+        next(f) # just use to skip first line
         Non_Bla = []
-        String1 = f.readline()
         for line in f:
             Cat = line.split('\t')[0].split('__')[4] # Drug category
             Gene = line.split('\t')[0].split('__')[2] # Gene Name
@@ -243,9 +230,8 @@ def Non_Bla_Genes(input_gamma):
 
 def HV_Genes(input_gamma):
     with open(input_gamma, 'r') as f:
-        header=next(f) # just use to skip first line
+        next(f) # just use to skip first line
         HV = []
-        String1 = f.readline()
         for line in f:
             Gene = line.split('\t')[0]
             HV.append(Gene)
@@ -327,7 +313,14 @@ def Get_Taxa_Source(taxa_file):
             #percent_match = re.findall(r'-.*?-', first_line)[0]
             #percent_match = re.sub( "-", '', percent_match)
             percent_match = percent_match + "% Scaffolds_assigned"
-    return taxa_source, percent_match
+        lines = f.readlines()
+        for line in lines:
+            if line.startswith("G:"):
+                genus = line.replace("G:	","").strip('\n')
+            if line.startswith("s:"):
+                species = line.replace("s:	","").strip('\n')
+        Species = genus + " " + species
+    return taxa_source, percent_match, Species
 
 def Get_Mutations(amr_file):
     point_mutations_list = []
@@ -347,7 +340,7 @@ def Get_Mutations(amr_file):
 def Get_Plasmids(pf_file):
     plasmid_marker_list = []
     with open(pf_file, 'r') as f:
-        header=next(f) # just use to skip first line
+        next(f) # just use to skip first line
         for line in f:
             Gene = line.split('\t')[0]
             Percent_Length = float(line.split('\t')[14])*100
@@ -374,10 +367,10 @@ def Get_BUSCO_Gene_Count(stats):
         percent=str(split_list[2].split("%")[0].strip())+"%"
         ratio="("+str(split_list[2].split("(")[1].strip())
         busco_line = percent + ' ' + ratio
-    busco_file = True
+    busco_file = "True"
     return busco_line, lineage, busco_file
 
-def Isolate_Line(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, gamma_ar, gamma_hv, stats, trimd_kraken, amr_file, pf_file):
+def Isolate_Line(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, gamma_ar, gamma_hv, stats, trimd_kraken, amr_file, pf_file, extended_qc):
     try:
         plasmid_marker_list = Get_Plasmids(pf_file)
     except:
@@ -386,11 +379,12 @@ def Isolate_Line(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, ga
         point_mutations_list = Get_Mutations(amr_file)
     except:
         point_mutations_list = 'Unknown'
-    try:
-        taxa_source, percent_match = Get_Taxa_Source(Taxa)
-    except:
-        taxa_source = 'Unknown'
-        percent_match = 'Unknown'
+    #try:
+    taxa_source, percent_match, Species = Get_Taxa_Source(Taxa)
+    #except:
+    #    taxa_source = 'Unknown'
+    #    percent_match = 'Unknown'
+    #    Species = 'Unknown'
     try:
         Coverage = Trim_Coverage(trimmed_counts, ratio_file)
     except:
@@ -417,10 +411,7 @@ def Isolate_Line(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, ga
         GC = GC_Content(quast_file)
     except:
         GC = 'Unknown'
-    try:
-        Species = Assembly_Ratio_Species(ratio_file)
-    except:
-        Species = 'Unknown'
+    # Check the taxa_source to determine where to get Species from
     # try:
     #     ST = MLST_ST(MLST_file)
     #     if len(ST) > 1:
@@ -515,17 +506,17 @@ def Isolate_Line(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, ga
         read_match = Get_Kraken_reads(stats, trimd_kraken)
     except:
         read_match = "Unknown"
-    if busco_file is None:
+    if busco_file is None and extended_qc == False:
         Line = ID + '\t' + QC_Outcome + '\t' + warning_count + '\t'  + Coverage + '\t' + Genome_Length + '\t' + Ratio + '\t' + Contigs + '\t' + GC + '\t' + Species + '\t' + percent_match + '\t' + taxa_source + '\t' + read_match + '\t' + scaffold_match + '\t' + MLST_scheme_1 + '\t' + MLST_type_1 + '\t' + MLST_scheme_2 + '\t' + MLST_type_2 + '\t' + Bla + '\t' + Non_Bla + '\t' + point_mutations_list + '\t' + HV + '\t' + plasmid_marker_list + '\t' + Reason
         busco = False
-    elif busco_file is not None:
+    elif busco_file is not None or extended_qc == True:
         Line = ID + '\t' + QC_Outcome + '\t' + warning_count + '\t'  + Coverage + '\t' + Genome_Length + '\t' + Ratio + '\t' + Contigs + '\t' + GC + '\t' + busco_line + '\t' + lineage + '\t' + Species + '\t' + percent_match + '\t' + taxa_source + '\t' + read_match + '\t' + scaffold_match + '\t' + MLST_scheme_1 + '\t' + MLST_type_1 + '\t' + MLST_scheme_2 + '\t' + MLST_type_2 + '\t' + Bla + '\t' + Non_Bla + '\t' + point_mutations_list + '\t' + HV + '\t' + plasmid_marker_list + '\t' + Reason
         busco = True
     return Line, busco
 
-def Isolate_Line_File(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, gamma_ar, gamma_hv, out_file, stats, trimd_kraken, mutations, pf_file):
+def Isolate_Line_File(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, gamma_ar, gamma_hv, out_file, stats, trimd_kraken, mutations, pf_file, extended_qc):
     with open(out_file, 'w') as f:
-        Line, busco = Isolate_Line(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, gamma_ar, gamma_hv, stats, trimd_kraken, mutations, pf_file)
+        Line, busco = Isolate_Line(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_file, gamma_ar, gamma_hv, stats, trimd_kraken, mutations, pf_file, extended_qc)
         if busco == True:
             f.write('ID\tAuto_QC_Outcome\tWarning_Count\tEstimated_Coverage\tGenome_Length\tAssembly_Ratio_(STDev)\t#_of_Scaffolds_>500bp\tGC_%\tBUSCO\tBUSCO_DB\tSpecies\tTaxa_Confidence\tTaxa_Source\tKraken2_Trimd\tKraken2_Weighted\tMLST_Scheme_1\tMLST_1\tMLST_Scheme_2\tMLST_2\tGAMMA_Beta_Lactam_Resistance_Genes\tGAMMA_Other_AR_Genes\tAMRFinder_Point_Mutations\tHypervirulence_Genes\tPlasmid_Incompatibility_Replicons\tAuto_QC_Failure_Reason\n')
         else:
@@ -535,7 +526,7 @@ def Isolate_Line_File(Taxa, ID, trimmed_counts, ratio_file, MLST_file, quast_fil
 def main():
     args = parseArgs()
     # if the output file already exists remove it
-    Isolate_Line_File(args.taxa, args.name, args.trimmed, args.ratio, args.mlst, args.quast, args.ar, args.vir, args.out, args.stats, args.trimd_kraken, args.mutations, args.pf)
+    Isolate_Line_File(args.taxa, args.name, args.trimmed, args.ratio, args.mlst, args.quast, args.ar, args.vir, args.out, args.stats, args.trimd_kraken, args.mutations, args.pf, args.extended_qc)
 
 if __name__ == '__main__':
     main()
diff --git a/bin/beforeSpades.sh b/bin/beforeSpades.sh
@@ -24,7 +24,7 @@ show_help () {
 
 # Parse command line options
 options_found=0
-while getopts ":h?d:n:k:s:" option; do
+while getopts ":h?d:n:k:s:c" option; do
 	options_found=$(( options_found + 1 ))
 	case "${option}" in
 		\?)
@@ -44,6 +44,9 @@ while getopts ":h?d:n:k:s:" option; do
 		s)
 			echo "Option -s triggered, argument = ${OPTARG}"
 			synopsis=${OPTARG};;
+		c)
+			echo "Option -c triggered"
+			cdc_extended_qc="true";;
 		:)
 			echo "Option -${OPTARG} requires as argument";;
 		h)
@@ -64,7 +67,19 @@ species_col=$(echo "${genus} ${species}")
 #get the number of warnings in the synopsis file
 warning_count=$(grep ": WARNING  :" $synopsis | wc -l)
 
-echo "${sample_name}\tFAIL\t${warning_count}\tUnknown\tUnknown\tUnknown\tUnknown\tUnknown\t${species_col}\tUnknown\tkraken2_trimmed\t${name}\tUnknown\tUnknown\tUnknown\tUnknown\tUnknown\tUnknown\tUnknown\tUnknown\tUnknown\tUnknown\tSPAdes_Failure" | tr -d '\n' > ${sample_name}_summaryline_failure.tsv
+if [[ "${cdc_extended_qc}" == "true" ]]; then
+	#for cdc_phoenix or cdc_scaffolds entry
+	echo "ID	Auto_QC_Outcome	tWarning_Count	Estimated_Coverage	Genome_Length	Assembly_Ratio_(STDev)	#_of_Scaffolds_>500bp	GC_%	Species	Taxa_Confidence	Taxa_Source	Kraken2_Trimd	Kraken2_Weighted	MLST_Scheme_1	MLST_1	MLST_Scheme_2	MLST_2	GAMMA_Beta_Lactam_Resistance_Genes	GAMMA_Other_AR_Genes	AMRFinder_Point_Mutations	Hypervirulence_Genes	Plasmid_Incompatibility_Replicons	Auto_QC_Failure_Reason" > ${sample_name}_summaryline_failure.tsv
+	#file contents
+	echo "${sample_name}	FAIL	${warning_count}	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	${species_col}	${spercent}% Reads_assigned	kraken2_trimmed	${name}	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	SPAdes_Failure" | tr -d '\n' >> ${sample_name}_summaryline_failure.tsv
+else
+	#for phoenix or scaffolds entry
+	#header
+	echo "ID	Auto_QC_Outcome	tWarning_Count	Estimated_Coverage	Genome_Length	Assembly_Ratio_(STDev)	#_of_Scaffolds_>500bp	GC_%	Species	Taxa_Confidence	Taxa_Source	Kraken2_Trimd	Kraken2_Weighted	MLST_Scheme_1	MLST_1	MLST_Scheme_2	MLST_2	GAMMA_Beta_Lactam_Resistance_Genes	GAMMA_Other_AR_Genes	AMRFinder_Point_Mutations	Hypervirulence_Genes	Plasmid_Incompatibility_Replicons	Auto_QC_Failure_Reason" > ${sample_name}_summaryline_failure.tsv
+	#file contents
+	echo "${sample_name}	FAIL	${warning_count}	Unknown	Unknown	Unknown	Unknown	Unknown	${species_col}	${spercent}% Reads_assigned	kraken2_trimmed	${name}	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	Unknown	SPAdes_Failure" | tr -d '\n' >> ${sample_name}_summaryline_failure.tsv
+fi
+
 cp ${sample_name}_summaryline_failure.tsv ${output_path}/${sample_name}/
 # copy the synopsis file
 cp ${sample_name}.synopsis ${output_path}/${sample_name}