Skip to content

Commit

Permalink
Added remap mode and removed biopython
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisgKent committed Sep 19, 2024
1 parent 81fc171 commit 473daf9
Show file tree
Hide file tree
Showing 8 changed files with 620 additions and 18 deletions.
84 changes: 69 additions & 15 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 32 additions & 0 deletions primalbedtools/fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# To keep deps low here is a simple fasta parser

from io import TextIOBase


def read_fasta(fasta_file: str | TextIOBase) -> dict[str, str]:
"""
Read a fasta file and return a dictionary with the sequence name as the key and the sequence as the value.
"""
sequences = {}

if isinstance(fasta_file, str):
handle = open(fasta_file)
else:
handle = fasta_file

with handle as f:
for line in f:
line = line.strip()
if line.startswith(">"):
seq_name = line[1:].split()[0]
if seq_name in sequences:
raise ValueError(f"Duplicate sequence name: {seq_name}")
sequences[seq_name] = []
else:
sequences[seq_name].append(line)

# Avoid str concatenation
for seq_name, seq in sequences.items():
sequences[seq_name] = "".join(seq)

return sequences
54 changes: 54 additions & 0 deletions primalbedtools/main.py
Original file line number Diff line number Diff line change
@@ -1 +1,55 @@
import argparse

from primalbedtools.bedfiles import BedLineParser, sort_bedlines, update_primernames
from primalbedtools.fasta import read_fasta
from primalbedtools.remap import remap


def main():
parser = argparse.ArgumentParser(description="PrimalBedTools")

subparsers = parser.add_subparsers(dest="subparser_name", required=True)

# Remap subcommand
remap_parser = subparsers.add_parser("remap", help="Remap BED file coordinates")
remap_parser.add_argument("--bed", type=str, help="Input BED file", required=True)
remap_parser.add_argument("--msa", type=str, help="Input MSA", required=True)
remap_parser.add_argument(
"--from_id", type=str, help="The ID to remap from", required=True
)
remap_parser.add_argument(
"--to_id", type=str, help="The ID to remap to", required=True
)

# Sort subcommand
sort_parser = subparsers.add_parser("sort", help="Sort BED file")
sort_parser.add_argument("bed", type=str, help="Input BED file")

# Update subcommand
update_parser = subparsers.add_parser(
"update", help="Update BED file with new information"
)
update_parser.add_argument("bed", type=str, help="Input BED file")

args = parser.parse_args()

# Read in the bed file
_headers, bedlines = BedLineParser.from_file(args.bed)

if args.subparser_name == "remap":
msa = read_fasta(args.msa)
bedlines = remap(args.from_id, args.to_id, bedlines, msa)
elif args.subparser_name == "sort":
bedlines = sort_bedlines(bedlines)
elif args.subparser_name == "update":
bedlines = update_primernames(bedlines)
else:
parser.print_help()

bedfile_str = BedLineParser.to_str(_headers, bedlines)
for line in bedfile_str.split("\n"):
print(line)


if __name__ == "__main__":
main()
128 changes: 128 additions & 0 deletions primalbedtools/remap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import numpy as np

from primalbedtools.bedfiles import BedLine


def create_mapping_array(
msa: dict[str, str], from_id: str, to_id: str
) -> tuple[np.ndarray, dict[int, int]]:
"""
Returns a mapping array and a dict of from_genome index to msa index
from_id will be array[0, :] and id_to will be array[1, :]
"""
# Check if IDs are in the MSA
if from_id not in msa:
raise ValueError(f"ID {from_id} not found in ({', '.join(msa.keys())})")
if to_id not in msa:
raise ValueError(f"ID {to_id} not found in ({', '.join(msa.keys())})")

# Check for same names
if from_id == to_id:
raise ValueError("IDs are the same")

# Check for different lengths
if len(msa[from_id]) != len(msa[to_id]):
raise ValueError("MSA lengths are different")

# The +1 is needed to account for edge case with primer at the end, due to non-inclusive slicing
msa_to_genome = np.full([2, len(msa[from_id]) + 1], None) # type: ignore

# populate with from_genome indexes
from_seq = msa[from_id]
from_index = 0
for msa_index in range(len(from_seq)):
if from_seq[msa_index] not in {"", "-"}:
msa_to_genome[0, msa_index] = from_index
from_index += 1
msa_to_genome[0, -1] = from_index

# to genome indexes
to_seq = msa[to_id]
to_index = 0
for msa_index in range(len(to_seq)):
if to_seq[msa_index] not in {"", "-"}:
msa_to_genome[1, msa_index] = to_index
to_index += 1
msa_to_genome[1, -1] = to_index

# Create a dict of primary ref to msa
from_index_to_msa_index = {}
for msa_index, from_index in enumerate(msa_to_genome[0]):
if from_index is not None:
from_index_to_msa_index[from_index] = msa_index

return msa_to_genome, from_index_to_msa_index


def remap(
from_id: str,
to_id: str,
bedlines: list[BedLine],
msa: dict[str, str],
):
msa_to_genome, from_index_to_msa_index = create_mapping_array(msa, from_id, to_id)

for bedline in bedlines:
# Guard for bedlines to other chromosomes
if bedline.chrom != from_id:
continue

msa_start = from_index_to_msa_index[bedline.start]
msa_end = from_index_to_msa_index[bedline.end]

# Check for perfect mapping
if None not in msa_to_genome[:, msa_start:msa_end]:
bedline.start = msa_to_genome[1, msa_start]
bedline.end = msa_to_genome[1, msa_end - 1] + 1
bedline.chrom = to_id
continue

# Check for primer not in the new reference
if np.flatnonzero(msa_to_genome[1, msa_start:msa_end]).size == 0:
print(f"{bedline.primername} not found in new reference")
# revert to original
continue

# Handle non 3' gaps
new_ref_slice = msa_to_genome[1, msa_start:msa_end]

if (
new_ref_slice[-1] if bedline.strand == "+" else new_ref_slice[0]
) is not None:
if bedline.strand == "+":
bedline.end = msa_to_genome[1, msa_end - 1] + 1
bedline.start = max(bedline.end - len(bedline.sequence), 0)
else:
bedline.start = msa_to_genome[1, msa_start]
bedline.end = min(
bedline.start + len(bedline.sequence), len(msa_to_genome[1])
)
bedline.chrom = to_id
continue
else:
print(f"{bedline.primername} 3' gap found in new reference")

# Handle 3' gaps
# At this point at least one base is 'mapped'
# Find the next valid 3' base
if bedline.strand == "+":
for i in range(msa_end, msa_to_genome.shape[1]):
if msa_to_genome[1, i] is not None:
bedline.end = msa_to_genome[1, i] + 1
bedline.start = max(bedline.end - len(bedline.sequence), 0)
bedline.chrom = to_id
break
continue
else:
for i in range(msa_start, -1, -1):
print(i)
if msa_to_genome[1, i] is not None:
bedline.start = msa_to_genome[1, i]
bedline.end = min(
bedline.start + len(bedline.sequence), len(msa_to_genome[1])
)
bedline.chrom = to_id
break
continue

return bedlines
9 changes: 6 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
[tool.poetry]
name = "primalbedtools"
version = "0.3.0"
version = "0.4.0"
description = "A collection of tools for working with primer.bed files"
authors = ["ChrisKent <[email protected]>"]
readme = "README.md"
repository = "https://github.com/ChrisgKent/primal-page"
repository = "https://github.com/ChrisgKent/primalbedtools"
license = "CC BY-SA 4.0"

[tool.poetry.dependencies]
python = "^3.9"

numpy = "2.0.0"

[tool.poetry.group.dev.dependencies]
ruff = "^0.5.5"
Expand All @@ -19,6 +19,9 @@ pre-commit = "^3.7.1"
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
primalbedtools = "primalbedtools.main:main"

[tool.ruff.lint]
select = [
# pycodestyle
Expand Down
8 changes: 8 additions & 0 deletions tests/msa.input.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>seq1
ATCGATCGATCATCGATCGATCGTAGCTAGCAYCG
CTAGCTAGCGATCGATCG
CAYTGCAC
CCAACCATGTACCGTCGAGTTA

>seq2
ATCGATCGATCATCGATCGAT
Loading

0 comments on commit 473daf9

Please sign in to comment.