Added remap mode and removed biopython

ChrisgKent · Sep 19, 2024 · 473daf9 · 473daf9
1 parent 81fc171
commit 473daf9
Show file tree

Hide file tree

Showing 8 changed files with 620 additions and 18 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/primalbedtools/fasta.py b/primalbedtools/fasta.py
@@ -0,0 +1,32 @@
+# To keep dependencies low, here is a simple FASTA parser
+
+from io import TextIOBase
+
+
+def read_fasta(fasta_file: "str | TextIOBase") -> dict[str, str]:
+    """
+    Read a FASTA file and return a dict mapping each sequence name to its sequence.
+
+    The sequence name is the first whitespace-delimited token after ">" on a
+    header line. Sequence lines are stripped and concatenated; blank lines
+    are ignored.
+
+    Args:
+        fasta_file: Path to a FASTA file, or an already-open text handle.
+            The handle is closed once reading finishes, including a handle
+            supplied by the caller.
+
+    Returns:
+        A dict of sequence name -> full sequence string, in file order.
+
+    Raises:
+        ValueError: On a duplicate sequence name, a header with no name, or
+            sequence data that appears before the first header.
+    """
+    # NOTE: the parameter annotation is quoted so the `|` union is never
+    # evaluated at definition time (that expression fails before Python 3.10).
+    if isinstance(fasta_file, str):
+        handle = open(fasta_file)
+    else:
+        handle = fasta_file
+
+    # Collect sequence lines per record and join once at the end to avoid
+    # repeated str concatenation.
+    chunks: dict[str, list[str]] = {}
+    current_name = None
+
+    with handle as f:
+        for raw_line in f:
+            line = raw_line.strip()
+            if not line:
+                # Blank lines carry no sequence data.
+                continue
+
+            if line.startswith(">"):
+                fields = line[1:].split()
+                if not fields:
+                    raise ValueError("Empty sequence name in FASTA header")
+                current_name = fields[0]
+                if current_name in chunks:
+                    raise ValueError(f"Duplicate sequence name: {current_name}")
+                chunks[current_name] = []
+            elif current_name is None:
+                raise ValueError("Sequence data found before the first FASTA header")
+            else:
+                chunks[current_name].append(line)
+
+    return {name: "".join(parts) for name, parts in chunks.items()}
diff --git a/primalbedtools/main.py b/primalbedtools/main.py
@@ -1 +1,55 @@
+import argparse
 
+from primalbedtools.bedfiles import BedLineParser, sort_bedlines, update_primernames
+from primalbedtools.fasta import read_fasta
+from primalbedtools.remap import remap
+
+
+def main():
+    """
+    Command-line entry point for PrimalBedTools.
+
+    Parses one of the `remap`, `sort` or `update` subcommands, reads the
+    given BED file, applies the selected operation to its bedlines and
+    prints the resulting BED file to stdout.
+    """
+    cli = argparse.ArgumentParser(description="PrimalBedTools")
+    commands = cli.add_subparsers(dest="subparser_name", required=True)
+
+    # remap: translate coordinates between two sequences of an MSA
+    remap_cmd = commands.add_parser("remap", help="Remap BED file coordinates")
+    remap_cmd.add_argument("--bed", type=str, help="Input BED file", required=True)
+    remap_cmd.add_argument("--msa", type=str, help="Input MSA", required=True)
+    remap_cmd.add_argument(
+        "--from_id", type=str, help="The ID to remap from", required=True
+    )
+    remap_cmd.add_argument(
+        "--to_id", type=str, help="The ID to remap to", required=True
+    )
+
+    # sort: takes the BED file as a positional argument
+    sort_cmd = commands.add_parser("sort", help="Sort BED file")
+    sort_cmd.add_argument("bed", type=str, help="Input BED file")
+
+    # update: takes the BED file as a positional argument
+    update_cmd = commands.add_parser(
+        "update", help="Update BED file with new information"
+    )
+    update_cmd.add_argument("bed", type=str, help="Input BED file")
+
+    args = cli.parse_args()
+    command = args.subparser_name
+
+    # Every subcommand starts from the parsed BED file
+    _headers, bedlines = BedLineParser.from_file(args.bed)
+
+    if command == "remap":
+        alignment = read_fasta(args.msa)
+        bedlines = remap(args.from_id, args.to_id, bedlines, alignment)
+    elif command == "sort":
+        bedlines = sort_bedlines(bedlines)
+    elif command == "update":
+        bedlines = update_primernames(bedlines)
+    else:
+        cli.print_help()
+
+    # Emit the (possibly modified) BED file on stdout
+    print(BedLineParser.to_str(_headers, bedlines))
+
+
+# Run the CLI when this module is executed directly as a script.
+if __name__ == "__main__":
+    main()
diff --git a/primalbedtools/remap.py b/primalbedtools/remap.py
@@ -0,0 +1,128 @@
+import numpy as np
+
+from primalbedtools.bedfiles import BedLine
+
+
+def create_mapping_array(
+    msa: dict[str, str], from_id: str, to_id: str
+) -> tuple[np.ndarray, dict[int, int]]:
+    """
+    Build the MSA-column -> genome-index lookup for two aligned sequences.
+
+    Returns a tuple of:
+      - a 2 x (MSA length + 1) object array; row 0 is from_id and row 1 is
+        to_id. Each cell holds the ungapped genome index of that MSA column,
+        or None where the sequence has a gap. The extra final column holds
+        the total number of ungapped bases in that sequence.
+      - a dict of from_id genome index -> MSA column index.
+
+    Raises ValueError if either ID is missing from the MSA, if the two IDs
+    are identical, or if the two aligned sequences differ in length.
+    """
+    # Both IDs must be present in the MSA
+    for seq_id in (from_id, to_id):
+        if seq_id not in msa:
+            raise ValueError(f"ID {seq_id} not found in ({', '.join(msa.keys())})")
+
+    # Mapping a sequence onto itself is rejected
+    if from_id == to_id:
+        raise ValueError("IDs are the same")
+
+    from_seq = msa[from_id]
+    to_seq = msa[to_id]
+
+    # Aligned sequences must share the same number of columns
+    if len(from_seq) != len(to_seq):
+        raise ValueError("MSA lengths are different")
+
+    # One extra column so an end coordinate just past the last base
+    # (non-inclusive slicing) can still be looked up
+    msa_to_genome = np.full([2, len(from_seq) + 1], None)  # type: ignore
+
+    # Row 0 is from_id, row 1 is to_id
+    for row, aligned_seq in enumerate((from_seq, to_seq)):
+        genome_pos = 0
+        for column, base in enumerate(aligned_seq):
+            if base in {"", "-"}:
+                continue
+            msa_to_genome[row, column] = genome_pos
+            genome_pos += 1
+        # Final column: number of ungapped bases seen
+        msa_to_genome[row, -1] = genome_pos
+
+    # Reverse lookup for the from_id row: genome index -> MSA column
+    from_index_to_msa_index = {
+        genome_pos: column
+        for column, genome_pos in enumerate(msa_to_genome[0])
+        if genome_pos is not None
+    }
+
+    return msa_to_genome, from_index_to_msa_index
+
+
+def remap(
+    from_id: str,
+    to_id: str,
+    bedlines: list[BedLine],
+    msa: dict[str, str],
+):
+    """
+    Remap bedlines from from_id coordinates onto to_id using an MSA.
+
+    Bedlines whose chrom is not from_id are left untouched. Each remaining
+    bedline is updated in place (start, end and chrom) as follows:
+
+      - Gap-free alignment over the primer: coordinates are translated
+        directly.
+      - No base of the primer aligns to to_id: the bedline is left unchanged
+        and a message is written to stderr.
+      - The 3' base aligns (gaps elsewhere): the 3' end is translated and the
+        other end is derived from len(bedline.sequence), clamped to the
+        to_id genome bounds.
+      - The 3' base falls in a gap: a message is written to stderr and the
+        3' end is anchored on the next aligned to_id base in the 3'
+        direction, or on the genome boundary if there is none.
+
+    Args:
+        from_id: MSA sequence ID the bedlines currently refer to.
+        to_id: MSA sequence ID to remap onto.
+        bedlines: Bedlines to remap; modified in place.
+        msa: Mapping of sequence ID -> aligned sequence.
+
+    Returns:
+        The same bedlines list that was passed in.
+
+    Raises:
+        ValueError: Propagated from create_mapping_array for invalid IDs or
+            mismatched MSA lengths.
+        KeyError: If a bedline start/end is not a valid from_id index.
+    """
+    # Diagnostics go to stderr so they do not end up in BED output that a
+    # caller prints to stdout.
+    import sys
+
+    msa_to_genome, from_index_to_msa_index = create_mapping_array(msa, from_id, to_id)
+    to_row = msa_to_genome[1]
+    # create_mapping_array stores the count of ungapped to_id bases in the
+    # final column, i.e. the length of the to_id genome.
+    to_length = to_row[-1]
+
+    for bedline in bedlines:
+        # Guard for bedlines to other chromosomes
+        if bedline.chrom != from_id:
+            continue
+
+        msa_start = from_index_to_msa_index[bedline.start]
+        msa_end = from_index_to_msa_index[bedline.end]
+        primer_len = len(bedline.sequence)
+
+        # Check for perfect mapping (no gap in either sequence)
+        if None not in msa_to_genome[:, msa_start:msa_end]:
+            bedline.start = to_row[msa_start]
+            bedline.end = to_row[msa_end - 1] + 1
+            bedline.chrom = to_id
+            continue
+
+        new_ref_slice = to_row[msa_start:msa_end]
+
+        # Check for primer not in the new reference. Test for None explicitly:
+        # a truthiness test would also treat genome index 0 as "missing".
+        if all(pos is None for pos in new_ref_slice):
+            print(f"{bedline.primername} not found in new reference", file=sys.stderr)
+            # Leave the bedline on its original coordinates
+            continue
+
+        # Handle non 3' gaps: the 3' base is the last column for "+" and the
+        # first column for any other strand.
+        three_prime = new_ref_slice[-1] if bedline.strand == "+" else new_ref_slice[0]
+        if three_prime is not None:
+            if bedline.strand == "+":
+                bedline.end = to_row[msa_end - 1] + 1
+                bedline.start = max(bedline.end - primer_len, 0)
+            else:
+                bedline.start = to_row[msa_start]
+                # Clamp to the to_id genome length, not the MSA width
+                bedline.end = min(bedline.start + primer_len, to_length)
+            bedline.chrom = to_id
+            continue
+
+        print(f"{bedline.primername} 3' gap found in new reference", file=sys.stderr)
+
+        # Handle 3' gaps
+        # At this point at least one base is 'mapped'
+        # Find the next valid 3' base
+        if bedline.strand == "+":
+            # The final column is never None, so this search always succeeds.
+            for i in range(msa_end, msa_to_genome.shape[1]):
+                if to_row[i] is not None:
+                    # The final column holds the genome length itself, so
+                    # clamp to keep the end within the genome.
+                    bedline.end = min(to_row[i] + 1, to_length)
+                    bedline.start = max(bedline.end - primer_len, 0)
+                    bedline.chrom = to_id
+                    break
+        else:
+            for i in range(msa_start, -1, -1):
+                if to_row[i] is not None:
+                    bedline.start = to_row[i]
+                    bedline.end = min(bedline.start + primer_len, to_length)
+                    bedline.chrom = to_id
+                    break
+            else:
+                # No aligned base upstream: anchor on the genome start,
+                # mirroring the "+" strand falling back to the genome end.
+                bedline.start = 0
+                bedline.end = min(primer_len, to_length)
+                bedline.chrom = to_id
+
+    return bedlines
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,15 +1,15 @@
 [tool.poetry]
 name = "primalbedtools"
-version = "0.3.0"
+version = "0.4.0"
 description = "A collection of tools for working with primer.bed files"
 authors = ["ChrisKent <[email protected]>"]
 readme = "README.md"
-repository = "https://github.com/ChrisgKent/primal-page"
+repository = "https://github.com/ChrisgKent/primalbedtools"
 license = "CC BY-SA 4.0"
 
 [tool.poetry.dependencies]
 python = "^3.9"
-
+numpy = "2.0.0"
 
 [tool.poetry.group.dev.dependencies]
 ruff = "^0.5.5"
@@ -19,6 +19,9 @@ pre-commit = "^3.7.1"
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
+[tool.poetry.scripts]
+primalbedtools = "primalbedtools.main:main"
+
 [tool.ruff.lint]
 select = [
     # pycodestyle

diff --git a/tests/msa.input.fasta b/tests/msa.input.fasta
@@ -0,0 +1,8 @@
+>seq1
+ATCGATCGATCATCGATCGATCGTAGCTAGCAYCG
+CTAGCTAGCGATCGATCG
+CAYTGCAC
+CCAACCATGTACCGTCGAGTTA
+
+>seq2
+ATCGATCGATCATCGATCGAT