adjust outputs to mirror existing

theiagen · Jun 27, 2024 · aaafeb8 · aaafeb8
1 parent c2f4a06
commit aaafeb8
Show file tree

Hide file tree

Showing 5 changed files with 19 additions and 13 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-ARG MERCURY_VER="1.0.1"
+ARG MERCURY_VER="1.0.2"
 
 FROM google/cloud-sdk:480.0.0-slim
 

diff --git a/README.md b/README.md
@@ -27,16 +27,16 @@ Default databases by organism:
 We highly recommend using the following Docker image to run Mercury:
 
 ```bash
-docker pull us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.1
+docker pull us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.2
 ```
 
 The entrypoint for this Docker image is the Mercury help message. To run this container interactively, use the following command:
 
 ```bash
-docker run -it --entrypoint=/bin/bash us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.1
+docker run -it --entrypoint=/bin/bash us-docker.pkg.dev/general-theiagen/theiagen/mercury:1.0.2
 # Once inside the container interactively, you can run the Mercury tool
 python3 /mercury/mercury/mercury.py -v
-# v1.0.1
+# v1.0.2
 ```
 
 ### Locally with Python

diff --git a/mercury/Metadata.py b/mercury/Metadata.py
@@ -40,8 +40,14 @@ def bankit_metadata(self):
 
   def sra_metadata(self):
     self.logger.debug("METADATA:Retrieving SRA metadata")
-    sra_required = ["bioproject_accession", "submission_id", "library_id", "organism", "isolation_source", "library_strategy", "library_source", "library_selection", "library_layout", "seq_platform", "instrument_model", "filetype"]
-    sra_optional = ["platform", "title", "design_description", "amplicon_primer_scheme", "amplicon_size", "assembly_method", "dehosting_method", "submitter_email"]
+    sra_required = ["bioproject_accession", "submission_id", "library_id", "organism", "isolation_source", "library_strategy", "library_source", "library_selection", "library_layout", "instrument_model", "filetype"]
+    sra_optional = ["title", "design_description", "amplicon_primer_scheme", "amplicon_size", "assembly_method", "dehosting_method", "submitter_email"]
+
+    # note: the flu metadata formatter currently uses "platform" instead of "seq_platform" because it uses the Terra_2_NCBI Pathogen BioSample formatter.
+    if self.organism == "flu":
+      sra_required.append("platform")
+    else:
+      sra_required.append("seq_platform")
     return sra_required, sra_optional
 
   def genbank_metadata(self):  

diff --git a/mercury/Table.py b/mercury/Table.py
@@ -141,7 +141,7 @@ def perform_quality_check(self):
 
     self.table.drop(self.table.index[self.table["vadr_num_alerts"].astype(str).str.contains("VADR skipped due to poor assembly")], inplace=True)
     self.table.drop(self.table.index[self.table["vadr_num_alerts"].astype(int) > self.vadr_alert_limit], inplace=True)
-    self.table.drop(self.table.index[self.table["number_N"].astype(int) > self.number_n_threshold], inplace=True)
+    self.table.drop(self.table.index[self.table["number_n"].astype(int) > self.number_n_threshold], inplace=True)
     self.table.drop(self.table.index[self.table["year"].isna()], inplace=True)
 
   def split_metadata(self):
@@ -248,10 +248,10 @@ def make_genbank_csv(self):
       else:
         genbank_metadata[column] = ""
 
-    genbank_metadata.rename(columns={"submission_id" : "Sequence_ID", "host_sci_name" : "host", "collection_date" : "collection-date", "isolation_source" : "isolation-source", "biosample_accession" : "BioSample", "bioproject_accession" : "BioProject"}, inplace=True)
+    genbank_metadata.rename(columns={"submission_id" : "Sequence_ID", "host_sci_name" : "host", "collection_date" : "collection-date", "isolation_source" : "isolation-source", "biosample_accession" : "BioSample", "bioproject_accession" : "BioProject", "country" : "geo_loc_name"}, inplace=True)
 
     if update_country:
-      genbank_metadata["country"] = genbank_metadata["country"] + ": " + genbank_metadata["state"]
+      genbank_metadata["geo_loc_name"] = genbank_metadata["geo_loc_name"] + ": " + genbank_metadata["state"]
 
     # remove state column from genbank
     genbank_metadata.drop("state", axis=1, inplace=True)
@@ -392,10 +392,10 @@ def make_gisaid_csv(self):
 
 
     self.logger.debug("TABLE:Now preparing the command to rewrite the header of every fasta file to the preferred format")
-    gisaid_metadata["new_filenames"] = gisaid_metadata["submission_id"] + "_gisaid.fasta"
-    assembly_tuples = list(zip(self.table[self.assembly_fasta_column_name], gisaid_metadata["new_filenames"], gisaid_metadata["gisaid_virus_name"]))
+    gisaid_metadata["fn"] = gisaid_metadata["submission_id"] + "_gisaid.fasta"
+    assembly_tuples = list(zip(self.table[self.assembly_fasta_column_name], gisaid_metadata["fn"], gisaid_metadata["gisaid_virus_name"]))
 
-    gisaid_metadata.drop(["submission_id", "new_filenames"], axis=1, inplace=True)
+    gisaid_metadata.drop(["submission_id"], axis=1, inplace=True)
 
     self.logger.debug("TABLE:Writing GISAID metadata out to a file")
     gisaid_metadata.rename(columns=gisaid_rename_headers, inplace=True)

diff --git a/mercury/__init__.py b/mercury/__init__.py
@@ -1 +1 @@
-__VERSION__ = "v1.0.1"
+__VERSION__ = "v1.0.2"