-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added remap mode and removed biopython
- Loading branch information
1 parent
81fc171
commit 473daf9
Showing
8 changed files
with
620 additions
and
18 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# To keep deps low here is a simple fasta parser | ||
|
||
from io import TextIOBase | ||
|
||
|
||
def read_fasta(fasta_file: str | TextIOBase) -> dict[str, str]: | ||
""" | ||
Read a fasta file and return a dictionary with the sequence name as the key and the sequence as the value. | ||
""" | ||
sequences = {} | ||
|
||
if isinstance(fasta_file, str): | ||
handle = open(fasta_file) | ||
else: | ||
handle = fasta_file | ||
|
||
with handle as f: | ||
for line in f: | ||
line = line.strip() | ||
if line.startswith(">"): | ||
seq_name = line[1:].split()[0] | ||
if seq_name in sequences: | ||
raise ValueError(f"Duplicate sequence name: {seq_name}") | ||
sequences[seq_name] = [] | ||
else: | ||
sequences[seq_name].append(line) | ||
|
||
# Avoid str concatenation | ||
for seq_name, seq in sequences.items(): | ||
sequences[seq_name] = "".join(seq) | ||
|
||
return sequences |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,55 @@ | ||
import argparse | ||
|
||
from primalbedtools.bedfiles import BedLineParser, sort_bedlines, update_primernames | ||
from primalbedtools.fasta import read_fasta | ||
from primalbedtools.remap import remap | ||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser(description="PrimalBedTools") | ||
|
||
subparsers = parser.add_subparsers(dest="subparser_name", required=True) | ||
|
||
# Remap subcommand | ||
remap_parser = subparsers.add_parser("remap", help="Remap BED file coordinates") | ||
remap_parser.add_argument("--bed", type=str, help="Input BED file", required=True) | ||
remap_parser.add_argument("--msa", type=str, help="Input MSA", required=True) | ||
remap_parser.add_argument( | ||
"--from_id", type=str, help="The ID to remap from", required=True | ||
) | ||
remap_parser.add_argument( | ||
"--to_id", type=str, help="The ID to remap to", required=True | ||
) | ||
|
||
# Sort subcommand | ||
sort_parser = subparsers.add_parser("sort", help="Sort BED file") | ||
sort_parser.add_argument("bed", type=str, help="Input BED file") | ||
|
||
# Update subcommand | ||
update_parser = subparsers.add_parser( | ||
"update", help="Update BED file with new information" | ||
) | ||
update_parser.add_argument("bed", type=str, help="Input BED file") | ||
|
||
args = parser.parse_args() | ||
|
||
# Read in the bed file | ||
_headers, bedlines = BedLineParser.from_file(args.bed) | ||
|
||
if args.subparser_name == "remap": | ||
msa = read_fasta(args.msa) | ||
bedlines = remap(args.from_id, args.to_id, bedlines, msa) | ||
elif args.subparser_name == "sort": | ||
bedlines = sort_bedlines(bedlines) | ||
elif args.subparser_name == "update": | ||
bedlines = update_primernames(bedlines) | ||
else: | ||
parser.print_help() | ||
|
||
bedfile_str = BedLineParser.to_str(_headers, bedlines) | ||
for line in bedfile_str.split("\n"): | ||
print(line) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
import numpy as np | ||
|
||
from primalbedtools.bedfiles import BedLine | ||
|
||
|
||
def create_mapping_array( | ||
msa: dict[str, str], from_id: str, to_id: str | ||
) -> tuple[np.ndarray, dict[int, int]]: | ||
""" | ||
Returns a mapping array and a dict of from_genome index to msa index | ||
from_id will be array[0, :] and id_to will be array[1, :] | ||
""" | ||
# Check if IDs are in the MSA | ||
if from_id not in msa: | ||
raise ValueError(f"ID {from_id} not found in ({', '.join(msa.keys())})") | ||
if to_id not in msa: | ||
raise ValueError(f"ID {to_id} not found in ({', '.join(msa.keys())})") | ||
|
||
# Check for same names | ||
if from_id == to_id: | ||
raise ValueError("IDs are the same") | ||
|
||
# Check for different lengths | ||
if len(msa[from_id]) != len(msa[to_id]): | ||
raise ValueError("MSA lengths are different") | ||
|
||
# The +1 is needed to account for edge case with primer at the end, due to non-inclusive slicing | ||
msa_to_genome = np.full([2, len(msa[from_id]) + 1], None) # type: ignore | ||
|
||
# populate with from_genome indexes | ||
from_seq = msa[from_id] | ||
from_index = 0 | ||
for msa_index in range(len(from_seq)): | ||
if from_seq[msa_index] not in {"", "-"}: | ||
msa_to_genome[0, msa_index] = from_index | ||
from_index += 1 | ||
msa_to_genome[0, -1] = from_index | ||
|
||
# to genome indexes | ||
to_seq = msa[to_id] | ||
to_index = 0 | ||
for msa_index in range(len(to_seq)): | ||
if to_seq[msa_index] not in {"", "-"}: | ||
msa_to_genome[1, msa_index] = to_index | ||
to_index += 1 | ||
msa_to_genome[1, -1] = to_index | ||
|
||
# Create a dict of primary ref to msa | ||
from_index_to_msa_index = {} | ||
for msa_index, from_index in enumerate(msa_to_genome[0]): | ||
if from_index is not None: | ||
from_index_to_msa_index[from_index] = msa_index | ||
|
||
return msa_to_genome, from_index_to_msa_index | ||
|
||
|
||
def remap( | ||
from_id: str, | ||
to_id: str, | ||
bedlines: list[BedLine], | ||
msa: dict[str, str], | ||
): | ||
msa_to_genome, from_index_to_msa_index = create_mapping_array(msa, from_id, to_id) | ||
|
||
for bedline in bedlines: | ||
# Guard for bedlines to other chromosomes | ||
if bedline.chrom != from_id: | ||
continue | ||
|
||
msa_start = from_index_to_msa_index[bedline.start] | ||
msa_end = from_index_to_msa_index[bedline.end] | ||
|
||
# Check for perfect mapping | ||
if None not in msa_to_genome[:, msa_start:msa_end]: | ||
bedline.start = msa_to_genome[1, msa_start] | ||
bedline.end = msa_to_genome[1, msa_end - 1] + 1 | ||
bedline.chrom = to_id | ||
continue | ||
|
||
# Check for primer not in the new reference | ||
if np.flatnonzero(msa_to_genome[1, msa_start:msa_end]).size == 0: | ||
print(f"{bedline.primername} not found in new reference") | ||
# revert to original | ||
continue | ||
|
||
# Handle non 3' gaps | ||
new_ref_slice = msa_to_genome[1, msa_start:msa_end] | ||
|
||
if ( | ||
new_ref_slice[-1] if bedline.strand == "+" else new_ref_slice[0] | ||
) is not None: | ||
if bedline.strand == "+": | ||
bedline.end = msa_to_genome[1, msa_end - 1] + 1 | ||
bedline.start = max(bedline.end - len(bedline.sequence), 0) | ||
else: | ||
bedline.start = msa_to_genome[1, msa_start] | ||
bedline.end = min( | ||
bedline.start + len(bedline.sequence), len(msa_to_genome[1]) | ||
) | ||
bedline.chrom = to_id | ||
continue | ||
else: | ||
print(f"{bedline.primername} 3' gap found in new reference") | ||
|
||
# Handle 3' gaps | ||
# At this point at least one base is 'mapped' | ||
# Find the next valid 3' base | ||
if bedline.strand == "+": | ||
for i in range(msa_end, msa_to_genome.shape[1]): | ||
if msa_to_genome[1, i] is not None: | ||
bedline.end = msa_to_genome[1, i] + 1 | ||
bedline.start = max(bedline.end - len(bedline.sequence), 0) | ||
bedline.chrom = to_id | ||
break | ||
continue | ||
else: | ||
for i in range(msa_start, -1, -1): | ||
print(i) | ||
if msa_to_genome[1, i] is not None: | ||
bedline.start = msa_to_genome[1, i] | ||
bedline.end = min( | ||
bedline.start + len(bedline.sequence), len(msa_to_genome[1]) | ||
) | ||
bedline.chrom = to_id | ||
break | ||
continue | ||
|
||
return bedlines |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,15 @@ | ||
[tool.poetry] | ||
name = "primalbedtools" | ||
version = "0.3.0" | ||
version = "0.4.0" | ||
description = "A collection of tools for working with primer.bed files" | ||
authors = ["ChrisKent <[email protected]>"] | ||
readme = "README.md" | ||
repository = "https://github.com/ChrisgKent/primal-page" | ||
repository = "https://github.com/ChrisgKent/primalbedtools" | ||
license = "CC BY-SA 4.0" | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.9" | ||
|
||
numpy = "2.0.0" | ||
|
||
[tool.poetry.group.dev.dependencies] | ||
ruff = "^0.5.5" | ||
|
@@ -19,6 +19,9 @@ pre-commit = "^3.7.1" | |
requires = ["poetry-core"] | ||
build-backend = "poetry.core.masonry.api" | ||
|
||
[tool.poetry.scripts] | ||
primalbedtools = "primalbedtools.main:main" | ||
|
||
[tool.ruff.lint] | ||
select = [ | ||
# pycodestyle | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
>seq1 | ||
ATCGATCGATCATCGATCGATCGTAGCTAGCAYCG | ||
CTAGCTAGCGATCGATCG | ||
CAYTGCAC | ||
CCAACCATGTACCGTCGAGTTA | ||
|
||
>seq2 | ||
ATCGATCGATCATCGATCGAT |
Oops, something went wrong.