Skip to content

Commit

Permalink
Merge pull request #68 from artic-network/main_unstable
Browse files Browse the repository at this point in the history
Updated short read handling
  • Loading branch information
rmcolq authored Sep 23, 2024
2 parents 0215963 + 170bcfa commit 60bcbd9
Show file tree
Hide file tree
Showing 24 changed files with 1,164 additions and 700 deletions.
97 changes: 97 additions & 0 deletions bin/assignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import sys
from collections import defaultdict


def trim_read_id(read_id):
    """
    Strip a trailing forward/reverse marker from a read name.

    Parameters:
        read_id (str): A read name, possibly ending in "/1" or "/2".
    Returns:
        str: The read name with any trailing "/1" or "/2" removed.
    """
    # endswith accepts a tuple, so both markers are checked in one call.
    if read_id.endswith(("/1", "/2")):
        return read_id[:-2]
    return read_id


def parse_kraken_assignment_line(line):
    """
    Parse the taxon_id and read_id from one line of a kraken assignment file.

    Parameters:
        line (str): A tab-separated line from a kraken assignment file.
    Returns:
        taxon_id (str): The NCBI taxon identifier, or -1 (int) if the line is
            malformed.
        read_id (str): Trimmed read identifier ("" if the line is malformed).
    """
    line_vals = line.strip().split("\t")
    if len(line_vals) < 5:
        # Malformed line: kraken assignment rows have at least 5 tab-separated
        # fields (status, read_id, taxon, length, LCA mapping).
        return -1, ""

    if "taxid" in line_vals[2]:
        # --use-names output: the field looks like "Name (taxid 1234)";
        # take everything after "taxid " and drop the closing ")".
        taxon_id = line_vals[2].split("taxid ")[-1][:-1]
    else:
        taxon_id = line_vals[2]

    read_id = trim_read_id(line_vals[1])

    # "A" placeholder taxid is remapped to 81077 (NCBI: artificial sequences)
    # — NOTE(review): confirm this matches the kraken version in use.
    if taxon_id == "A":
        taxon_id = "81077"
    return taxon_id, read_id


class KrakenAssignments:
    """
    A class representing a kraken assignment file.

    Attributes:
        file (str): Name of file to parse.
    """
    def __init__(self, assignment_file):
        """
        Initializes a KrakenAssignments object.

        Parameters:
            assignment_file (str): Path to the kraken assignment file.
        """
        self.file = assignment_file

    def parse_kraken_assignment_file(self, taxon_id_map, parents=None):
        """
        Parses the kraken assignment file and collects the read_ids associated
        with each of the required taxon ids.

        Reads whose successive lines disagree on the assigned taxon are
        discarded as ambiguous.

        Parameters:
            taxon_id_map (dict): A dict from a taxon id to a list of related taxon_ids.
            parents (dict): A dict mapping taxon id to parent taxon id from NCBI Taxonomy.
        Returns:
            read_map (dict): A dict from read_id to a set of keys from the
                taxon_id_map if the read_id was classified as the taxon_id.
        """
        read_map = defaultdict(set)
        with open(self.file, "r") as kfile:
            for line in kfile:
                taxon_id, read_id = parse_kraken_assignment_line(line)
                if taxon_id in taxon_id_map:
                    # BUGFIX: was `taxon_id != read_map[read_id]`, comparing a
                    # str to a set (always unequal), so any read seen twice was
                    # dropped even when its assignments agreed. Only drop on a
                    # genuine conflict.
                    if read_id in read_map and taxon_id not in read_map[read_id]:
                        del read_map[read_id]
                    else:
                        read_map[read_id].add(taxon_id)
                elif parents:
                    # Handle case where taxon_id has changed: walk up the
                    # taxonomy until a requested taxon (or the root "1") is hit.
                    current = taxon_id
                    while current in parents and current not in taxon_id_map and current != "1":
                        current = parents[current]
                    if current in taxon_id_map:
                        print(f"Add {taxon_id} to {current} list")
                        # Same conflict rule as above (see BUGFIX note).
                        if read_id in read_map and current not in read_map[read_id]:
                            del read_map[read_id]
                        else:
                            read_map[read_id].add(current)
        return read_map

19 changes: 14 additions & 5 deletions bin/check_hcid.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ def check_report_for_hcid(hcid_dict, taxonomy_dir, kreport_file):
hcid_dict[taxid]["classified_parent_found"] = True


def map_to_refs(query, reference):
def map_to_refs(query, reference, preset):
counts = defaultdict(int)
ranges = defaultdict(list)
a = mp.Aligner(reference, best_n=1) # load or build index
a = mp.Aligner(reference, best_n=1, preset=preset) # load or build index
if not a:
raise Exception("ERROR: failed to load/build index")

Expand Down Expand Up @@ -168,8 +168,8 @@ def check_pileup(ref, ref_ranges, reference_file, min_coverage=0):
return 0


def check_ref_coverage(hcid_dict, query, reference):
counts, ranges = map_to_refs(query, reference)
def check_ref_coverage(hcid_dict, query, reference, preset):
counts, ranges = map_to_refs(query, reference, preset)

for taxon in hcid_dict:
taxon_found = True
Expand Down Expand Up @@ -288,8 +288,17 @@ def main():
default="resources/hcid_refs.fa.gz",
help="Reference FASTA for each HCID",
)
parser.add_argument(
"--illumina",
action="store_true",
required=False,
help="Use the short read minimap preset",
)

args = parser.parse_args()
preset = None
if args.illumina:
preset = "sr"

# Start Program
now = datetime.now()
Expand All @@ -301,7 +310,7 @@ def main():
sys.stderr.write("Check kraken report for counts\n")
check_report_for_hcid(hcid_dict, args.taxonomy, args.kreport_file)
sys.stderr.write("Check mapped coverage\n")
check_ref_coverage(hcid_dict, args.reads, args.ref_fasta)
check_ref_coverage(hcid_dict, args.reads, args.ref_fasta, preset)
sys.stderr.write("Report findings\n")
report_findings(hcid_dict, args.prefix)

Expand Down
11 changes: 6 additions & 5 deletions bin/concatenate_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@

def enforce_headers(f1_header, f2_header):
if f1_header[0] != "@" or f2_header[0] != "@":
# raise Exception("Invalid input FASTQ files.")
sys.exit(5)
if f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
# raise Exception(
# "Input FASTQ files do not share headers. " "Check and re-run w/o --strict."
# )

if f1_header.split()[0].endswith("/1") and f2_header.split()[0].endswith("/2"):
if f1_header.split()[0][:-2] != f2_header.split()[0][:-2]:
sys.exit(8)

elif f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
sys.exit(8)


Expand Down
Loading

0 comments on commit 60bcbd9

Please sign in to comment.