pairwise_alignment_map: move logic out of prec and into pp5.align

vistalab-technion · Jun 2, 2024 · 7f5c4f5 · 7f5c4f5
1 parent 9e9d018
commit 7f5c4f5
Showing 1 changed file with 65 additions and 2 deletions.
diff --git a/src/pp5/align.py b/src/pp5/align.py
@@ -12,7 +12,7 @@
 import warnings
 import contextlib
 import subprocess
-from typing import Any, Dict, Tuple, Union, Iterable, Optional
+from typing import Any, Dict, List, Tuple, Union, Iterable, Optional
 from pathlib import Path
 from datetime import datetime, timedelta
 
@@ -21,11 +21,12 @@
 from tqdm import tqdm
 from Bio.Seq import Seq
 
+from pp5.codons import ACIDS_1TO3, UNKNOWN_AA
 from pp5.external_dbs.pdb import PDB_RCSB
 
 with warnings.catch_warnings():
     warnings.simplefilter("ignore", BiopythonExperimentalWarning)
-    from Bio.Align import substitution_matrices
+    from Bio.Align import substitution_matrices, PairwiseAligner
 
 from Bio.AlignIO import MultipleSeqAlignment as MSA
 from Bio.SeqRecord import SeqRecord
@@ -57,6 +58,68 @@
 BLOSUM90 = substitution_matrices.load("BLOSUM90")
 
 
+def pairwise_alignment_map(
+    src_seq: str,
+    tgt_seq: str,
+    open_gap_score: float = -10,
+    extend_gap_score: float = -0.5,
+) -> Tuple[float, Dict[int, int]]:
+    """
+    Aligns between two AA sequences and produces a map from the indices of
+    the source sequence to the indices of the target sequence.
+    Uses biopython's PairwiseAligner with BLOSUM80 matrix.
+
+    :param src_seq: Source AA sequence to align.
+    :param tgt_seq: Target AA sequence to align.
+    :param open_gap_score: Penalty for opening a gap in the alignment.
+    :param extend_gap_score: Penalty for extending a gap by one residue.
+    :return: A tuple with two elements:
+    -  The alignment score, higher is better.
+    -  A dict mapping from an index in the source sequence to the corresponding
+       index in the target sequence.
+    """
+    aligner = PairwiseAligner(
+        substitution_matrix=BLOSUM80,
+        open_gap_score=open_gap_score,
+        extend_gap_score=extend_gap_score,
+    )
+
+    # In rare cases, there could be unknown letters in the sequences. This causes
+    # the alignment to break. Replace with "X" which the aligner can handle.
+    unknown_aas = set(src_seq).union(set(tgt_seq)) - set(ACIDS_1TO3)
+    for unk_aa in unknown_aas:  # usually there are none
+        tgt_seq = tgt_seq.replace(unk_aa, UNKNOWN_AA)
+        src_seq = src_seq.replace(unk_aa, UNKNOWN_AA)
+
+    multi_alignments = aligner.align(src_seq, tgt_seq)
+    alignment = sorted(multi_alignments, key=lambda a: a.score)[-1]
+    score = alignment.score
+    # LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}")
+
+    # Alignment contains two tuples each of length N (for N matching sub-sequences)
+    # (
+    #   ((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)),
+    #   ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN))
+    # )
+    src_to_tgt: List[Tuple[int, int]] = []
+    src_subseqs, tgt_subseqs = alignment.aligned
+    assert len(src_subseqs) == len(tgt_subseqs)
+    for i in range(len(src_subseqs)):
+        src_start, src_end = src_subseqs[i]
+        tgt_start, tgt_end = tgt_subseqs[i]
+        assert src_end - src_start == tgt_end - tgt_start
+
+        for j in range(src_end - src_start):
+            if src_seq[src_start + j] != tgt_seq[tgt_start + j]:
+                # There are mismatches included in the match sequence (cases
+                # where a similar AA is not considered a complete mismatch).
+                # We are stricter: require exact match.
+                continue
+            src_to_tgt.append((src_start + j, tgt_start + j))
+
+    return score, dict(src_to_tgt)
+
+
 def multiseq_align(
     seqs: Iterable[SeqRecord] = None, in_file=None, out_file=None, **clustal_kw
 ) -> MSA: