From 7bbf065162c16e1d7e1d953e1fcd646ea05e7782 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Fri, 9 Feb 2024 18:46:52 -0500
Subject: [PATCH 01/15] Create codeql-analysis.yml (#25)
From 891eec087ddbe7eea4c06f8ad1c0bebbabac3e23 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Fri, 9 Feb 2024 18:53:47 -0500
Subject: [PATCH 02/15] Create codeql.yml (#29)
---
.github/workflows/codeql.yml | 84 ++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
create mode 100644 .github/workflows/codeql.yml
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000..9f2f022
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,84 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+ push:
+ branches: [ "master" ]
+ pull_request:
+ branches: [ "master" ]
+ schedule:
+ - cron: '43 3 * * 5'
+
+jobs:
+ analyze:
+ name: Analyze
+ # Runner size impacts CodeQL analysis time. To learn more, please see:
+ # - https://gh.io/recommended-hardware-resources-for-running-codeql
+ # - https://gh.io/supported-runners-and-hardware-resources
+ # - https://gh.io/using-larger-runners
+ # Consider using larger runners for possible analysis time improvements.
+ runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
+ timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
+ permissions:
+ # required for all workflows
+ security-events: write
+
+ # only required for workflows in private repositories
+ actions: read
+ contents: read
+
+ strategy:
+ fail-fast: false
+ matrix:
+ language: [ 'python' ]
+ # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ]
+ # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both
+ # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
+ # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ # Initializes the CodeQL tools for scanning.
+ - name: Initialize CodeQL
+ uses: github/codeql-action/init@v3
+ with:
+ languages: ${{ matrix.language }}
+ # If you wish to specify custom queries, you can do so here or in a config file.
+ # By default, queries listed here will override any specified in a config file.
+ # Prefix the list here with "+" to use these queries and those in the config file.
+
+ # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+ # queries: security-extended,security-and-quality
+
+
+ # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
+ # If this step fails, then you should remove it and run the build manually (see below)
+ - name: Autobuild
+ uses: github/codeql-action/autobuild@v3
+
+ # ℹ️ Command-line programs to run using the OS shell.
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+ # If the Autobuild fails above, remove it and uncomment the following three lines.
+ # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
+
+ # - run: |
+ # echo "Run, Build Application using script"
+ # ./location_of_script_within_repo/buildscript.sh
+
+ - name: Perform CodeQL Analysis
+ uses: github/codeql-action/analyze@v3
+ with:
+ category: "/language:${{matrix.language}}"
From d1fdf37db23748818bfc8d1d5321df7c304401b4 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Mon, 12 Feb 2024 14:04:29 -0500
Subject: [PATCH 03/15] Dthoward96 workflow correction (#30)
* Delete .github/workflows/codeql-analysis.yml
* Delete .github/workflows/python-package-mamba.yml
---
.github/workflows/codeql-analysis.yml | 72 ----------------------
.github/workflows/python-package-mamba.yml | 24 --------
2 files changed, 96 deletions(-)
delete mode 100644 .github/workflows/codeql-analysis.yml
delete mode 100644 .github/workflows/python-package-mamba.yml
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
deleted file mode 100644
index 2c77411..0000000
--- a/.github/workflows/codeql-analysis.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-# For most projects, this workflow file will not need changing; you simply need
-# to commit it to your repository.
-#
-# You may wish to alter this file to override the set of languages analyzed,
-# or to provide custom queries or build logic.
-#
-# ******** NOTE ********
-# We have attempted to detect the languages in your repository. Please check
-# the `language` matrix defined below to confirm you have the correct set of
-# supported CodeQL languages.
-#
-name: "CodeQL"
-
-on:
- push:
- branches: [ master ]
- pull_request:
- # The branches below must be a subset of the branches above
- branches: [ master ]
- schedule:
- - cron: '40 12 * * 5'
-
-jobs:
- analyze:
- name: Analyze
- runs-on: ubuntu-latest
- permissions:
- actions: read
- contents: read
- security-events: write
-
- strategy:
- fail-fast: false
- matrix:
- language: [ 'python' ]
- # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
- # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
-
- steps:
- - name: Checkout repository
- uses: actions/checkout@v3
-
- # Initializes the CodeQL tools for scanning.
- - name: Initialize CodeQL
- uses: github/codeql-action/init@v2
- with:
- languages: ${{ matrix.language }}
- # If you wish to specify custom queries, you can do so here or in a config file.
- # By default, queries listed here will override any specified in a config file.
- # Prefix the list here with "+" to use these queries and those in the config file.
-
- # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
- # queries: security-extended,security-and-quality
-
-
- # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
- # If this step fails, then you should remove it and run the build manually (see below)
- - name: Autobuild
- uses: github/codeql-action/autobuild@v2
-
- # ℹ️ Command-line programs to run using the OS shell.
- # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
-
- # If the Autobuild fails above, remove it and uncomment the following three lines.
- # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
-
- # - run: |
- # echo "Run, Build Application using script"
- # ./location_of_script_within_repo/buildscript.sh
-
- - name: Perform CodeQL Analysis
- uses: github/codeql-action/analyze@v2
diff --git a/.github/workflows/python-package-mamba.yml b/.github/workflows/python-package-mamba.yml
deleted file mode 100644
index 80a9e3a..0000000
--- a/.github/workflows/python-package-mamba.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: Set up Mamba Environment
-
-on: [push]
-
-jobs:
- build-linux:
- runs-on: ubuntu-latest
- strategy:
- max-parallel: 5
-
- steps:
- - uses: mamba-org/setup-micromamba@v1
- - name: Set up Mamba Environment
- with:
- environment-file: env.yaml
- init-shell: >-
- bash
- powershell
- cache-environment: true
- post-cleanup: 'all'
- - name: Test with pytest
- run: |
- mamba install pytest
- pytest
From ed2402a57c513bd3367a1566b08d870091c05228 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Mon, 12 Feb 2024 17:04:36 -0500
Subject: [PATCH 04/15] Add files via upload (#31)
---
README.Rmd | 14 ++++++++++++++
README.md | 53 ++++++++++++++++++++++++++++++++++++++---------------
2 files changed, 52 insertions(+), 15 deletions(-)
diff --git a/README.Rmd b/README.Rmd
index 5de7a98..37b09a2 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -30,10 +30,20 @@ github_pages_url <- description$GITHUB_PAGES
**General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm). GitHub is not hosted by the CDC, but is a third party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.
+# [Documentation](`r github_pages_url`/index.html)
+
## Overview
``r program`` is a Python program that is developed to automate the process of generating necessary submission files and batch uploading them to NCBI archives (such as **BioSample**, **SRA**, and **Genbank**) and GISAID databases (e.g. **EpiFlu** and **EpiCoV**). Presently, the pipeline is capable of uploading **Influenza A Virus** (FLU) and **SARS-COV-2** (COV) data. However, the dynamic nature of this pipeline can allow for additional uploads of other organisms in future updates or requests.
+## Contacts
+
+| Role | Contact |
+| ---------- | ------- |
+| Creator | [Dakota Howard](https://github.com/dthoward96), [Reina Chau](https://github.com/rchau88) |
+| Maintainer | [Dakota Howard](https://github.com/dthoward96) |
+| Back-Up | [Reina Chau](https://github.com/rchau88), [Brian Lee](https://github.com/leebrian) |
+
## Prerequisites
- **NCBI Submissions**
@@ -93,6 +103,10 @@ Before submitters can perform a batch submission using ``r program``, they must
- [How to run seqsender with Compose](`r github_pages_url`/articles/compose_installation.html)
- [How to run seqsender with Singularity](`r github_pages_url`/articles/singularity_installation.html)
+## Code Attributions
+
+Dakota Howard and Reina Chau for majority of the code base with input and testing from [colleagues](`r github_pages_url`/authors.html).
+
## Public Domain Standard Notice
This repository constitutes a work of the United States Government and is not subject to domestic copyright protection under 17 USC § 105. This repository is in the public domain within the United States, and copyright and related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/). All contributions to this repository will be released under the CC0 dedication. By submitting a pull request you are agreeing to comply with this waiver of copyright interest.
diff --git a/README.md b/README.md
index b02dd5c..355b377 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,19 @@
+
+
+
+
Public Database Submission Pipeline
+
**Beta Version**: 1.1.0. This pipeline is currently in Beta testing, and
issues could appear during submission. Please use it at your own risk.
-Feedback and suggestions are welcome!
+Feedback and suggestions are welcome\!
**General Disclaimer**: This repository was created for use by CDC
programs to collaborate on public health related projects in support of
@@ -18,6 +23,8 @@ CDC and its partners to share information and collaborate on software.
CDC use of GitHub does not imply an endorsement of any one particular
service, product, or enterprise.
+# [Documentation](https://cdcgov.github.io/seqsender/index.html)
+
## Overview
`seqsender` is a Python program that is developed to automate the
@@ -29,9 +36,17 @@ A Virus** (FLU) and **SARS-COV-2** (COV) data. However, the dynamic
nature of this pipeline can allow for additional uploads of other
organisms in future updates or requests.
+## Contacts
+
+| Role | Contact |
+| ---------- | ---------------------------------------------------------------------------------------- |
+| Creator | [Dakota Howard](https://github.com/dthoward96), [Reina Chau](https://github.com/rchau88) |
+| Maintainer | [Dakota Howard](https://github.com/dthoward96) |
+| Back-Up | [Reina Chau](https://github.com/rchau88), [Brian Lee](https://github.com/leebrian) |
+
## Prerequisites
-- **NCBI Submissions**
+ - **NCBI Submissions**
`seqsender` utilizes an UI-Less Data Submission Protocol to bulk upload
submission files (e.g., *submission.xml*, *submission.zip*, etc.) to
@@ -63,11 +78,11 @@ FTP on the command line. Before attempting to submit a submission using
gb-admin@ncbi.nlm.nih.gov
to discuss requirements for submissions.
-5. Coordinate a NCBI namespace name (**spuid_namespace**) that will be
+5. Coordinate a NCBI namespace name (**spuid\_namespace**) that will be
used with Submitter Provided Unique Identifiers (**spuid**) in the
- submission. The liaison of **spuid_namespace** and **spuid** is used
- to report back assigned accessions as well as for cross-linking
- objects within submission. The values of **spuid_namespace** are up
+ submission. The liaison of **spuid\_namespace** and **spuid** is
+ used to report back assigned accessions as well as for cross-linking
+ objects within submission. The values of **spuid\_namespace** are up
to the submitter to decide but they must be unique and
well-coordinated prior to make a submission. For more information
about these two fields, see
@@ -78,7 +93,9 @@ FTP on the command line. Before attempting to submit a submission using
[GENBANK](https://cdcgov.github.io/seqsender/articles/genbank_submission.html#metadata)
metadata requirements.
-- **GISAID Submissions**
+
+
+ - **GISAID Submissions**
`seqsender` makes use of GISAID’s Command Line Interface tools to bulk
uploading meta- and sequence-data to GISAID databases. Presently, the
@@ -145,14 +162,20 @@ prepared and stored in a submission directory of choice.
## Quick Start
-- [How to run seqsender
- locally](https://cdcgov.github.io/seqsender/articles/local_installation.html)
-- [How to run seqsender with
- Docker](https://cdcgov.github.io/seqsender/articles/docker_installation.html)
-- [How to run seqsender with
- Compose](https://cdcgov.github.io/seqsender/articles/compose_installation.html)
-- [How to run seqsender with
- Singularity](https://cdcgov.github.io/seqsender/articles/singularity_installation.html)
+ - [How to run seqsender
+ locally](https://cdcgov.github.io/seqsender/articles/local_installation.html)
+ - [How to run seqsender with
+ Docker](https://cdcgov.github.io/seqsender/articles/docker_installation.html)
+ - [How to run seqsender with
+ Compose](https://cdcgov.github.io/seqsender/articles/compose_installation.html)
+ - [How to run seqsender with
+ Singularity](https://cdcgov.github.io/seqsender/articles/singularity_installation.html)
+
+## Code Attributions
+
+Dakota Howard and Reina Chau for majority of the code base with input
+and testing from
+[colleagues](https://cdcgov.github.io/seqsender/authors.html).
## Public Domain Standard Notice
From 9bda9d679cf11a64919767340451fc57396791cd Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Fri, 16 Feb 2024 11:44:56 -0500
Subject: [PATCH 05/15] bug fix process.py (#32)
Add a missing return to the config file function and correct an error message so that it prints the correct df
---
process.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/process.py b/process.py
index 6abf15d..68d4152 100644
--- a/process.py
+++ b/process.py
@@ -86,6 +86,7 @@ def get_config(config_file, database):
if type(config_dict) is dict:
try:
config_dict = config_dict['Submission']
+ return config_dict
except:
print("Error: there is no Submission information in the config file.", file=sys.stderr)
sys.exit(1)
@@ -274,7 +275,7 @@ def process_fasta_samples(metadata, fasta_file):
# Check duplicates in fasta_df
duplicated_df = fasta_df[fasta_df.duplicated(subset = ["fasta_name_orig"], keep = False)]
if not duplicated_df.empty:
- print("Error: Sequences in fasta file must be unique at: " + fasta_file + "\nDuplicate Sequences\n" + df["fasta_sequence_orig"].to_string(index=False), file=sys.stderr)
+ print("Error: Sequences in fasta file must be unique at: " + fasta_file + "\nDuplicate Sequences\n" + fasta_df["fasta_name_orig"].to_string(index=False), file=sys.stderr)
sys.exit(1)
# Validate duplicates don't appear on merge
try:
From f7cb198b79f5ab6ad902b1388485f6b5f1cf46be Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Thu, 22 Feb 2024 13:03:04 -0500
Subject: [PATCH 06/15] Create GHCR_docker.yml (#33)
* Create GHCR_docker.yml
* Update GHCR_docker.yml
Correct changes for master branch
---
.github/workflows/GHCR_docker.yml | 42 +++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
create mode 100644 .github/workflows/GHCR_docker.yml
diff --git a/.github/workflows/GHCR_docker.yml b/.github/workflows/GHCR_docker.yml
new file mode 100644
index 0000000..bedf2ec
--- /dev/null
+++ b/.github/workflows/GHCR_docker.yml
@@ -0,0 +1,42 @@
+name: Create and publish docker image to GHCR
+
+on:
+ push:
+ branches: [ "master" ]
+
+env:
+ REGISTRY: ghcr.io
+ IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+ build-and-push-image:
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ packages: write
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v3
+
+ - name: Log into container registry
+ uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d
+ with:
+ registry: ${{ env.REGISTRY }}
+ username: ${{ github.actor }}
+ password: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Extract Docker metadata
+ id: meta
+ uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934
+ with:
+ images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+ tags: type=ref,event=branch
+
+ - name: Build and push Docker image
+ uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09
+ with:
+ context: .
+ push: true
+ tags: ${{ steps.meta.outputs.tags }}
+ labels: ${{ steps.meta.outputs.labels }}
From 0304732f8f1719921e266eb48e9f3dff12979258 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Fri, 23 Feb 2024 09:58:09 -0500
Subject: [PATCH 07/15] FTP folder bug fix submit.py (#34)
Some FTP accounts have the folder structure /submit/Production/ instead of /Production/. This fix automatically corrects for this difference in folder structure.
---
submit.py | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/submit.py b/submit.py
index d2cfcc4..645c01e 100644
--- a/submit.py
+++ b/submit.py
@@ -48,9 +48,20 @@ def submit_ncbi(database, submission_name, submission_dir, config_dict, submissi
print("If this is not a '" + submission_type + "' submission, interrupts submission immediately.", file=sys.stdout)
print("\n"+"Connecting to NCBI FTP Server", file=sys.stdout)
print("Submission name: " + submission_name, file=sys.stdout)
- # CD to to test/production folder
+ # Check FTP folder structure either /submit/Production/ or /Production/
+ if submission_type not in ftp.nlst():
+ # Check if submit folder exists
+ if "submit" in ftp.nlst():
+ ftp.cwd("submit")
+ # If submit folder exists check if Production/Test folder exists
+ if submission_type not in ftp.nlst():
+ print("Error: Cannot find submission folder on NCBI FTP site.", file=sys.stderr)
+ sys.exit(1)
+ else:
+ print("Error: Cannot find submission folder on NCBI FTP site.", file=sys.stderr)
+ sys.exit(1)
ftp.cwd(submission_type)
- # Create submission directory if it does not exist
+ # Create submission name directory if it does not exist
if ncbi_submission_name not in ftp.nlst():
ftp.mkd(ncbi_submission_name)
# CD to submission folder
From c2645d4a1166d82bcc8c31b3ba05e2d934212349 Mon Sep 17 00:00:00 2001
From: snu3
Date: Fri, 23 Feb 2024 12:46:47 -0500
Subject: [PATCH 08/15] update template metadata required fields + check
submitting databases are valid
---
docker-compose.yaml | 9 +-------
process.py | 4 ++--
template/FLU/flu_gisaid_metadata.csv | 34 ++++++++++++++--------------
3 files changed, 20 insertions(+), 27 deletions(-)
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 76d7564..99334e5 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -6,18 +6,11 @@ x-data-volumes:
source: $HOME/Github/Testings/seqsender
target: /data
-x-seqsender-code:
- &seqsender-code
- type: bind
- source: $HOME/Github/seqsender
- target: /seqsender
-
services:
seqsender:
container_name: seqsender
- image: cdcgov/seqsender-dev:latest
+ image: cdcgov/seqsender:latest
restart: always
volumes:
- *data-volume
- - *seqsender-code
command: tail -f /dev/null
diff --git a/process.py b/process.py
index 68d4152..1f3917e 100644
--- a/process.py
+++ b/process.py
@@ -86,7 +86,6 @@ def get_config(config_file, database):
if type(config_dict) is dict:
try:
config_dict = config_dict['Submission']
- return config_dict
except:
print("Error: there is no Submission information in the config file.", file=sys.stderr)
sys.exit(1)
@@ -94,10 +93,11 @@ def get_config(config_file, database):
# Check if each database has portal information listed in the config file
for d in range(len(database)):
if submission_portals[d] not in config_dict.keys():
- print("\n"+"Error: " + database[d] + " is listed as one of the submitting databases in the command.", file=sys.stderr)
+ print("\n"+"Error: " + database[d] + " is listed as one of the submitting databases.", file=sys.stderr)
print("Error: However, there is no " + submission_portals[d] + " submission information provided in the config file.", file=sys.stderr)
print("Error: Either remove " + database[d] + " from the submitting databases or update your config file."+"\n", file=sys.stderr)
sys.exit(1)
+ return config_dict
else:
print("Error: Config file is incorrect. File must has a valid yaml format.", file=sys.stderr)
sys.exit(1)
diff --git a/template/FLU/flu_gisaid_metadata.csv b/template/FLU/flu_gisaid_metadata.csv
index 882659f..9235634 100644
--- a/template/FLU/flu_gisaid_metadata.csv
+++ b/template/FLU/flu_gisaid_metadata.csv
@@ -1,17 +1,17 @@
-sequence_name,organism, collection_date,authors,gs-Isolate_Name,gs-seq_id,gs-segment,gs-Subtype,gs-Lineage,gs-Passage_History,gs-Location,gs-province,gs-sub_province,gs-Location_Additional_info,gs-Host,gs-Host_Additional_info,gs-Submitting_Sample_Id,gs-Originating_Lab_Id,gs-Originating_Sample_Id,gs-Antigen_Character,gs-Adamantanes_Resistance_geno,gs-Oseltamivir_Resistance_geno,gs-Zanamivir_Resistance_geno,gs-Peramivir_Resistance_geno,gs-Other_Resistance_geno,gs-Adamantanes_Resistance_pheno,gs-Oseltamivir_Resistance_pheno,gs-Zanamivir_Resistance_pheno,gs-Peramivir_Resistance_pheno,gs-Other_Resistance_pheno,gs-Host_Age,gs-Host_Age_Unit,gs-Host_Gender,gs-Health_Status,gs-Note,gs-PMID
-XX-566912_PB2,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_PB2,PB2,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,Y,F,,,
-XX-566912_PB1,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_PB1,PB1,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,Y,F,,,
-XX-566912_PA,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_PA,PA,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,Y,F,,,
-XX-566912_HA,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_HA,HA,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,Y,F,,,
-XX-566912_NP,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_NP,NP,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,Y,F,,,
-XX-566912_NA,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_NA,NA,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,Y,F,,,
-XX-566912_M,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_MP,MP,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,Y,F,,,
-XX-566912_NS,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_NS,NS,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,Y,F,,,
-XX-566913_PB2,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_PB2,PB2,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,Y,M,,,
-XX-566913_PB1,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_PB1,PB1,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,Y,M,,,
-XX-566913_PA,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_PA,PA,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,Y,M,,,
-XX-566913_HA,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_HA,HA,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,Y,M,,,
-XX-566913_NP,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_NP,NP,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,Y,M,,,
-XX-566913_NA,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_NA,NA,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,Y,M,,,
-XX-566913_M,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_MP,MP,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,Y,M,,,
-XX-566913_NS,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_NS,NS,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,Y,M,,,
+sequence_name,organism, collection_date,authors,gs-Isolate_Name,gs-seq_id,gs-segment,gs-Subtype,gs-Lineage,gs-Passage_History,gs-Location,gs-province,gs-sub_province,gs-Location_Additional_info,gs-Host,gs-Host_Additional_info,gs-Submitting_Sample_Id,gs-Originating_Lab_Id,gs-Originating_Sample_Id,gs-Antigen_Character,gs-Adamantanes_Resistance_geno,gs-Oseltamivir_Resistance_geno,gs-Zanamivir_Resistance_geno,gs-Peramivir_Resistance_geno,gs-Other_Resistance_geno,gs-Adamantanes_Resistance_pheno,gs-Oseltamivir_Resistance_pheno,gs-Zanamivir_Resistance_pheno,gs-Peramivir_Resistance_pheno,gs-Other_Resistance_pheno,gs-Host_Age,gs-Collection_Month,gs-Collection_Year,gs-Host_Age_Unit,gs-Host_Gender,gs-Health_Status,gs-Note,gs-PMID
+XX-566912_PB2,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_PB2,PB2,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,,,Y,F,,,
+XX-566912_PB1,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_PB1,PB1,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,,,Y,F,,,
+XX-566912_PA,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_PA,PA,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,,,Y,F,,,
+XX-566912_HA,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_HA,HA,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,,,Y,F,,,
+XX-566912_NP,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_NP,NP,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,,,Y,F,,,
+XX-566912_NA,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_NA,NA,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,,,Y,F,,,
+XX-566912_M,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_MP,MP,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,,,Y,F,,,
+XX-566912_NS,Influenza A virus,12/28/2016,"Doe, John; Doe, Jane;",A/California/566912/2016,A/California/566912/2016_NS,NS,H3N2,,Original,United States,California,,,Human,,,3080,,,,,,,,,,,,,92,,,Y,F,,,
+XX-566913_PB2,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_PB2,PB2,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,,,Y,M,,,
+XX-566913_PB1,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_PB1,PB1,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,,,Y,M,,,
+XX-566913_PA,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_PA,PA,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,,,Y,M,,,
+XX-566913_HA,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_HA,HA,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,,,Y,M,,,
+XX-566913_NP,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_NP,NP,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,,,Y,M,,,
+XX-566913_NA,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_NA,NA,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,,,Y,M,,,
+XX-566913_M,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_MP,MP,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,,,Y,M,,,
+XX-566913_NS,Influenza A virus,11/10/2016,"Doe, John; Doe, Jane;",A/Texas/566913/2016,A/Texas/566913/2016_NS,NS,H3N2,,Original,United States,Texas,,,Human,,,3081,,,,,,,,,,,,,21,,,Y,M,,,
From d26b581c85f334e4edd9f292c3b1f781c84acae8 Mon Sep 17 00:00:00 2001
From: rchau88 <110563969+rchau88@users.noreply.github.com>
Date: Wed, 6 Mar 2024 10:48:38 -0500
Subject: [PATCH 09/15] Update process.py
removed the gs-sequence_name specified for flu
---
process.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/process.py b/process.py
index 1f3917e..b6d5abf 100644
--- a/process.py
+++ b/process.py
@@ -193,7 +193,7 @@ def read_gisaid_log(log_file, submission_status_file):
# Save submission status df
submission_status.to_csv(submission_status_file, header = True, index = False)
not_submitted = submission_status[~submission_status["gisaid_accession_epi_isl_id"].str.contains("EPI", na=False)].copy()
- return not_submitted[["gs-sample_name", "gs-sequence_name"]]
+ return not_submitted[["gs-sample_name"]]
# Check user credentials information
def check_credentials(config_dict, database):
From 6e95bd3ed75ffd340f0be28ec3def5f9e3442ecc Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Fri, 8 Mar 2024 13:42:50 -0500
Subject: [PATCH 10/15] Create docker_test_build.yml (#41)
Automatically test-builds the Dockerfile on pull requests. This will prevent merging to master if the Dockerfile fails to build correctly.
---
.github/workflows/docker_test_build.yml | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
create mode 100644 .github/workflows/docker_test_build.yml
diff --git a/.github/workflows/docker_test_build.yml b/.github/workflows/docker_test_build.yml
new file mode 100644
index 0000000..cd6c320
--- /dev/null
+++ b/.github/workflows/docker_test_build.yml
@@ -0,0 +1,18 @@
+name: Build test Docker image
+
+on:
+ push:
+ branches: [ "master" ]
+ pull_request:
+ branches: [ "master" ]
+
+jobs:
+
+ build:
+
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Build the Docker image
+ run: docker build . --file Dockerfile
From 3f6433de17d5cb695b254352b4a00116d6d2bec1 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Fri, 8 Mar 2024 15:10:38 -0500
Subject: [PATCH 11/15] Dthoward96 org id patch (#42)
* Update cov_config.yaml
remove org_id from examples
* Update flu_config.yaml
remove org_id from examples
* Update create.py
remove org_id from xml creation
---
create.py | 2 --
template/COV/cov_config.yaml | 1 -
template/FLU/flu_config.yaml | 1 -
3 files changed, 4 deletions(-)
diff --git a/create.py b/create.py
index f5f2a62..501621f 100644
--- a/create.py
+++ b/create.py
@@ -111,8 +111,6 @@ def create_submission_xml(organism, database, submission_name, config_dict, meta
comment.text = config_dict["Description"]["Comment"]
# Description info including organization and contact info
organization = etree.SubElement(description, "Organization", type=config_dict["Description"]["Organization"]["@type"], role=config_dict["Description"]["Organization"]["@role"])
- if config_dict["Description"]["Organization"]["@org_id"]:
- organization.set("org_id", config_dict["Description"]["Organization"]["@org_id"])
org_name = etree.SubElement(organization, "Name")
org_name.text = config_dict["Description"]["Organization"]["Name"]
if "GENBANK" not in database:
diff --git a/template/COV/cov_config.yaml b/template/COV/cov_config.yaml
index bd8f31e..1ab6387 100644
--- a/template/COV/cov_config.yaml
+++ b/template/COV/cov_config.yaml
@@ -10,7 +10,6 @@ Submission:
Organization:
'@role': owner
'@type': institute
- '@org_id': 12345
Name: CDC
Address:
Affil: Centers for Disease Control and Prevention
diff --git a/template/FLU/flu_config.yaml b/template/FLU/flu_config.yaml
index 6a2455a..8c7ae34 100644
--- a/template/FLU/flu_config.yaml
+++ b/template/FLU/flu_config.yaml
@@ -9,7 +9,6 @@ Submission:
Organization:
'@role': owner
'@type': institute
- '@org_id': 12345
Name: CDC
Address:
Affil: Centers for Disease Control and Prevention
From de80b4862fa033b69a9fa6e481acb17e4e00f441 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Tue, 12 Mar 2024 16:11:15 -0400
Subject: [PATCH 12/15] Dthoward96 bugfix (#45)
* Update report.py
Bug fix to allow for other submit folders on the NCBI FTP site
* Update process.py
bug fix for capitalization of folder name
---
process.py | 2 +-
report.py | 11 ++++++++++-
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/process.py b/process.py
index b6d5abf..22467c6 100644
--- a/process.py
+++ b/process.py
@@ -308,7 +308,7 @@ def update_submission_status(submission_dir, submission_name, organism, test):
if test == True:
submission_type = "Test"
else:
- submission_type = "production"
+ submission_type = "Production"
# Check if given organism exist in the log
df_partial = df.loc[(df["Organism"] == organism) & (df["Submission_Name"] == submission_name) & (df["Submission_Directory"] == submission_dir) & (df["Submission_Type"] == submission_type)]
if df_partial.shape[0] == 0:
diff --git a/report.py b/report.py
index 348244b..dfabf8d 100644
--- a/report.py
+++ b/report.py
@@ -40,7 +40,16 @@ def get_ncbi_process_report(database, submission_name, submission_files_dir, con
FTP_HOST = process.get_main_config()["PORTAL_NAMES"]["NCBI"]["FTP_HOST"]
ftp = ftplib.FTP(FTP_HOST)
ftp.login(user=config_dict["Username"], passwd=config_dict["Password"])
- # CD to to test or production folder
+ # Check if submit folder exists
+ if "submit" in ftp.nlst():
+ ftp.cwd("submit")
+ # If submit folder exists check if Production/Test folder exists
+ if submission_type not in ftp.nlst():
+ print("Error: Cannot find submission folder on NCBI FTP site.", file=sys.stderr)
+ sys.exit(1)
+ else:
+ print("Error: Cannot find submission folder on NCBI FTP site.", file=sys.stderr)
+ sys.exit(1)
ftp.cwd(submission_type)
# Check if submission name exists
if ncbi_submission_name not in ftp.nlst():
From 998065c4140df580ef599466ca3402fe5ae5ecb7 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Wed, 3 Apr 2024 12:57:50 -0400
Subject: [PATCH 13/15] Add files via upload (#49)
Changes the requirement from isolate only to either strain or isolate for BioSample and GenBank
---
process.py | 39 +++++++++++++++++++++++++--------------
1 file changed, 25 insertions(+), 14 deletions(-)
diff --git a/process.py b/process.py
index 22467c6..9ac66c9 100644
--- a/process.py
+++ b/process.py
@@ -65,7 +65,7 @@ def get_required_colnames(database, organism):
if len(database_list) > 0:
# Get all common fields across all databases in a portal
if "COMMON_FIELDS" in list(main_config["PORTAL_NAMES"][portal].keys()):
- all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["COMMON_FIELDS"].keys())
+ all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["COMMON_FIELDS"].keys())
# Get required fields for given organism
if organism in list(main_config["PORTAL_NAMES"][portal]["DATABASE"].keys()):
all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"][organism].keys())
@@ -74,7 +74,7 @@ def get_required_colnames(database, organism):
if database_name in list(main_config["PORTAL_NAMES"][portal]["DATABASE"].keys()):
all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"][database_name].keys())
# Extract the unique metadata fields
- return set(all_required_colnames)
+ return set(all_required_colnames)
# Check the config file
def get_config(config_file, database):
@@ -98,7 +98,7 @@ def get_config(config_file, database):
print("Error: Either remove " + database[d] + " from the submitting databases or update your config file."+"\n", file=sys.stderr)
sys.exit(1)
return config_dict
- else:
+ else:
print("Error: Config file is incorrect. File must has a valid yaml format.", file=sys.stderr)
sys.exit(1)
@@ -120,11 +120,23 @@ def get_metadata(database, organism, metadata_file):
required_date_colnames = list(filter(lambda x: ("&" in x)==True, db_required_colnames))
# Obtain the real required column names without the asterisks and & signs
required_colnames = [re.sub("[*?#&]", "", x) for x in db_required_colnames]
+ # Remove ISOLATE FROM REQUIRED COLNAMES FOR TEMP FIX
+ required_colnames = [x for x in required_colnames if "-isolate" not in x]
# Check if required column names are existed in metadata file
if not set(required_colnames).issubset(set(metadata.columns)):
failed_required_colnames = list(filter(lambda x: (x in metadata.columns)==False, required_colnames))
print("Error: Metadata file must have the following required column names: " + ", ".join(failed_required_colnames), file=sys.stderr)
sys.exit(1)
+ ################# TEMPORARY FIX ###################
+ # Temporary fix to require either isolate or strain field not both
+ if "BIOSAMPLE" in database:
+ if "bs-isolate" not in metadata and "bs-strain" not in metadata:
+ print("Error: Metadata file must have one of these required columns: \"bs-isolate\" or \"bs-strain\".", file=sys.stderr)
+ sys.exit(1)
+ if "GENBANK" in database:
+ if "src-isolate" not in metadata and "src-strain" not in metadata:
+ print("Error: Metadata file must have one of these required columns: \"src-isolate\" or \"src-strain\".", file=sys.stderr)
+ sys.exit(1)
# Run some checks to make sure the required column fields are populated correctly
for name in required_colnames:
# Make sure specific fields have a correct date format
@@ -171,12 +183,12 @@ def read_gisaid_log(log_file, submission_status_file):
if "epi_isl".upper() in line.upper():
column_name = "gs-sample_name"
sample_name = list(set(filter(lambda x: (x.upper() in line.upper())==True, submission_status[column_name])))
- accession_id = "epi_isl_id"
+ accession_id = "epi_isl_id"
accession = re.search("EPI_ISL_[1-9]+", line)
elif "epi_id".upper() in line.upper():
column_name = "gs-sequence_name"
sample_name = list(set(filter(lambda x: (x.upper() in line.upper())==True, submission_status[column_name])))
- accession_id = "epi_id"
+ accession_id = "epi_id"
accession = re.search("EPI[1-9]+", line)
else:
continue
@@ -327,10 +339,10 @@ def update_submission_status(submission_dir, submission_name, organism, test):
for database_name in database:
print("\n" + "Submission database: " + database_name, file=sys.stdout)
df = pd.read_csv(submission_log_file, header = 0, dtype = str, engine = "python", encoding="utf-8", index_col=False).sort_values('Submission_Position', ascending=True)
- df_processing = df[(df["Organism"] == organism) & (df["Database"] == database_name) & (df["Submission_Directory"] == submission_dir) & (df["Submission_Name"] == submission_name) & (df["Submission_Type"] == submission_type)]
+ df_processing = df[(df["Organism"] == organism) & (df["Database"] == database_name) & (df["Submission_Directory"] == submission_dir) & (df["Submission_Name"] == submission_name) & (df["Submission_Type"] == submission_type)]
df_processing = df_processing.reset_index(drop=True)
- submission_dir = df_processing["Submission_Directory"][0]
- submission_position = df_processing["Submission_Position"][0]
+ submission_dir = df_processing["Submission_Directory"][0]
+ submission_position = df_processing["Submission_Position"][0]
submission_id, submission_status = df_processing["Submission_Status"][0].strip().split(";")
config_file = df_processing["Config_File"][0]
table2asn = df_processing["Table2asn"][0]
@@ -358,11 +370,11 @@ def update_submission_status(submission_dir, submission_name, organism, test):
print("There is no GISAID CLI package for " + organism + " located at "+ gisaid_cli, file=sys.stderr)
print("Please download the CLI package from GISAID platform", file=sys.stderr)
print("Then place a copy of the CLI binary at "+ gisaid_cli, file=sys.stderr)
- sys.exit(1)
+ sys.exit(1)
# Check the status of the submission
if "processed-ok" in submission_status:
print("Submission status: " + submission_status, file=sys.stdout)
- else:
+ else:
# Pull download submission report and update its status
if database_name in ["BIOSAMPLE", "SRA", "GENBANK"]:
# If report exists, processing the report and output status of the submission
@@ -384,8 +396,8 @@ def update_submission_status(submission_dir, submission_name, organism, test):
for db in other_submitting_db:
db_df = df.loc[df["Database"] == db]
db_df = db_df.reset_index(drop=True)
- db_status = db_df["Submission_Status"][0]
- # If the status of biosample or sra is processed-ok, then go ahead and submit to Genbank
+ db_status = db_df["Submission_Status"][0]
+ # If the status of biosample or sra is processed-ok, then go ahead and submit to Genbank
if "processed-ok" in db_status:
all_status += [1]
report.update_genbank_files(database=database, organism=organism, submission_files_dir=submission_files_dir, submission_status_file=submission_status_file)
@@ -419,9 +431,8 @@ def update_submission_status(submission_dir, submission_name, organism, test):
if "processed-ok" in db_status:
report.update_gisaid_files(organism=organism, submission_files_dir=submission_files_dir, submission_status_file=submission_status_file)
submission_status = submit.submit_gisaid(organism=organism, database=database_name, submission_dir=submission_dir, submission_name=submission_name, config_dict=config_dict["GISAID"], gisaid_cli=gisaid_cli, submission_status_file=submission_status_file, submission_type=submission_type)
- submission_id = ""
+ submission_id = ""
# Update status in the submission log
create.create_submission_log(database=database_name, submission_position=submission_position, organism=organism, submission_name=submission_name, submission_dir=submission_dir, config_file=config_file, submission_status=submission_status, submission_id=submission_id, table2asn=table2asn, gff_file=gff_file, submission_type=submission_type)
# Print out the submission status
print("Submission status: " + submission_status, file=sys.stdout)
-
From c61812bb7f436209d287ef5d6eaf652c31c04530 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Thu, 4 Apr 2024 16:04:56 -0400
Subject: [PATCH 14/15] create.py duplicate strain name bug (#50)
Fixes issue that creates duplicate strain columns when using src-strain
---
create.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/create.py b/create.py
index 501621f..e3ec2c1 100644
--- a/create.py
+++ b/create.py
@@ -399,7 +399,7 @@ def create_genbank_files(organism, config_dict, metadata, fasta_file, submission
# Retrieve the source df"
source_df = metadata.filter(regex="^gb-seq_id$|^src-|^ncbi-spuid$|^ncbi-bioproject$|^organism$|^collection_date$").copy()
source_df.columns = source_df.columns.str.replace("src-","").str.strip()
- source_df = source_df.rename(columns = {"gb-seq_id":"Sequence_ID", "collection_date":"Collection_date", "ncbi-spuid":"strain"})
+ source_df = source_df.rename(columns = {"gb-seq_id":"Sequence_ID", "collection_date":"Collection_date"})
# Add BioProject if available
if "ncbi-bioproject" in source_df:
source_df = source_df.rename(columns={"ncbi-bioproject": "BioProject"})
From 67d14c9b31a0eb6d4087d67cc222f69c8a87bb87 Mon Sep 17 00:00:00 2001
From: Dakota Howard <58985143+dthoward96@users.noreply.github.com>
Date: Fri, 5 Apr 2024 17:29:11 -0400
Subject: [PATCH 15/15] Bug fix process.py (#51)
Bug fix for upload log. When only one database was submitted, it would convert the database name into a list and error out.
---
process.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/process.py b/process.py
index 9ac66c9..23d3585 100644
--- a/process.py
+++ b/process.py
@@ -79,7 +79,7 @@ def get_required_colnames(database, organism):
# Check the config file
def get_config(config_file, database):
# Determine which portal is the database belongs to
- submission_portals = ["NCBI" if x in ["BIOSAMPLE", "SRA", "GENBANK"] else "GISAID" for x in database]
+ submission_portals = ["NCBI" if x in ["BIOSAMPLE", "SRA", "GENBANK"] else "GISAID" if x in ["GISAID"] else "Unknown" for x in database]
# Read in config file
with open(config_file, "r") as f:
config_dict = yaml.load(f, Loader=yaml.BaseLoader) # Load yaml as str only
@@ -362,7 +362,7 @@ def update_submission_status(submission_dir, submission_name, organism, test):
print("Error: Config file for "+submission_name+" does not exist at "+config_file, file=sys.stderr)
sys.exit(1)
else:
- config_dict = get_config(config_file=config_file, database=database_name)
+ config_dict = get_config(config_file=config_file, database=database)
# IF GISAID in a list of submitting databases, check if CLI is downloaded and store in the correct directory
gisaid_cli = os.path.join(submission_dir, "gisaid_cli", organism.lower()+"CLI", organism.lower()+"CLI") if "GISAID" in database_name else None
# Check if the gisaid_cli exists