Skip to content

Commit

Permalink
Merge pull request #68 from artic-network/main_unstable
Browse files Browse the repository at this point in the history
Updated short read handling
  • Loading branch information
rmcolq authored Sep 23, 2024
2 parents 0215963 + 170bcfa commit 60bcbd9
Show file tree
Hide file tree
Showing 24 changed files with 1,164 additions and 700 deletions.
97 changes: 97 additions & 0 deletions bin/assignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import sys
from collections import defaultdict


def trim_read_id(read_id):
    """
    Strip a trailing forward/reverse marker from a read name.

    Parameters:
        read_id (str): A read name, possibly ending in "/1" or "/2".
    Returns:
        str: The read name with any trailing "/1" or "/2" removed.
    """
    # endswith accepts a tuple, so both markers are checked in one call.
    if read_id.endswith(("/1", "/2")):
        return read_id[:-2]
    return read_id


def parse_kraken_assignment_line(line):
    """
    Parse the taxon_id and read_id from one line of a kraken assignment file.

    Parameters:
        line (str): A tab-separated line from a kraken assignment file.
    Returns:
        taxon_id (str): The NCBI taxon identifier, or -1 (int) if the line is
            malformed.
        read_id (str): Trimmed read identifier ("" if the line is malformed).
    """
    line_vals = line.strip().split("\t")
    if len(line_vals) < 5:
        # Malformed line: kraken assignment rows have at least 5 tab-separated
        # fields (status, read_id, taxon, length, LCA mapping).
        return -1, ""

    if "taxid" in line_vals[2]:
        # --use-names output: the field looks like "Name (taxid 1234)";
        # take everything after "taxid " and drop the closing ")".
        taxon_id = line_vals[2].split("taxid ")[-1][:-1]
    else:
        taxon_id = line_vals[2]

    read_id = trim_read_id(line_vals[1])

    # "A" placeholder taxid is remapped to 81077 (NCBI: artificial sequences)
    # — NOTE(review): confirm this matches the kraken version in use.
    if taxon_id == "A":
        taxon_id = "81077"
    return taxon_id, read_id


class KrakenAssignments:
    """
    A class representing a kraken assignment file.

    Attributes:
        file (str): Name of file to parse.
    """
    def __init__(self, assignment_file):
        """
        Initializes a KrakenAssignments object.

        Parameters:
            assignment_file (str): Path to the kraken assignment file.
        """
        self.file = assignment_file

    def parse_kraken_assignment_file(self, taxon_id_map, parents=None):
        """
        Parses the kraken assignment file and collects the read_ids associated
        with each of the required taxon ids.

        Reads whose successive lines disagree on the assigned taxon are
        discarded as ambiguous.

        Parameters:
            taxon_id_map (dict): A dict from a taxon id to a list of related taxon_ids.
            parents (dict): A dict mapping taxon id to parent taxon id from NCBI Taxonomy.
        Returns:
            read_map (dict): A dict from read_id to a set of keys from the
                taxon_id_map if the read_id was classified as the taxon_id.
        """
        read_map = defaultdict(set)
        with open(self.file, "r") as kfile:
            for line in kfile:
                taxon_id, read_id = parse_kraken_assignment_line(line)
                if taxon_id in taxon_id_map:
                    # BUGFIX: was `taxon_id != read_map[read_id]`, comparing a
                    # str to a set (always unequal), so any read seen twice was
                    # dropped even when its assignments agreed. Only drop on a
                    # genuine conflict.
                    if read_id in read_map and taxon_id not in read_map[read_id]:
                        del read_map[read_id]
                    else:
                        read_map[read_id].add(taxon_id)
                elif parents:
                    # Handle case where taxon_id has changed: walk up the
                    # taxonomy until a requested taxon (or the root "1") is hit.
                    current = taxon_id
                    while current in parents and current not in taxon_id_map and current != "1":
                        current = parents[current]
                    if current in taxon_id_map:
                        print(f"Add {taxon_id} to {current} list")
                        # Same conflict rule as above (see BUGFIX note).
                        if read_id in read_map and current not in read_map[read_id]:
                            del read_map[read_id]
                        else:
                            read_map[read_id].add(current)
        return read_map

19 changes: 14 additions & 5 deletions bin/check_hcid.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ def check_report_for_hcid(hcid_dict, taxonomy_dir, kreport_file):
hcid_dict[taxid]["classified_parent_found"] = True


def map_to_refs(query, reference):
def map_to_refs(query, reference, preset):
counts = defaultdict(int)
ranges = defaultdict(list)
a = mp.Aligner(reference, best_n=1) # load or build index
a = mp.Aligner(reference, best_n=1, preset=preset) # load or build index
if not a:
raise Exception("ERROR: failed to load/build index")

Expand Down Expand Up @@ -168,8 +168,8 @@ def check_pileup(ref, ref_ranges, reference_file, min_coverage=0):
return 0


def check_ref_coverage(hcid_dict, query, reference):
counts, ranges = map_to_refs(query, reference)
def check_ref_coverage(hcid_dict, query, reference, preset):
counts, ranges = map_to_refs(query, reference, preset)

for taxon in hcid_dict:
taxon_found = True
Expand Down Expand Up @@ -288,8 +288,17 @@ def main():
default="resources/hcid_refs.fa.gz",
help="Reference FASTA for each HCID",
)
parser.add_argument(
"--illumina",
action="store_true",
required=False,
help="Use the short read minimap preset",
)

args = parser.parse_args()
preset = None
if args.illumina:
preset = "sr"

# Start Program
now = datetime.now()
Expand All @@ -301,7 +310,7 @@ def main():
sys.stderr.write("Check kraken report for counts\n")
check_report_for_hcid(hcid_dict, args.taxonomy, args.kreport_file)
sys.stderr.write("Check mapped coverage\n")
check_ref_coverage(hcid_dict, args.reads, args.ref_fasta)
check_ref_coverage(hcid_dict, args.reads, args.ref_fasta, preset)
sys.stderr.write("Report findings\n")
report_findings(hcid_dict, args.prefix)

Expand Down
11 changes: 6 additions & 5 deletions bin/concatenate_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@

def enforce_headers(f1_header, f2_header):
if f1_header[0] != "@" or f2_header[0] != "@":
# raise Exception("Invalid input FASTQ files.")
sys.exit(5)
if f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
# raise Exception(
# "Input FASTQ files do not share headers. " "Check and re-run w/o --strict."
# )

if f1_header.split()[0].endswith("/1") and f2_header.split()[0].endswith("/2"):
if f1_header.split()[0][:-2] != f2_header.split()[0][:-2]:
sys.exit(8)

elif f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
sys.exit(8)


Expand Down
Loading

0 comments on commit 60bcbd9

Please sign in to comment.