Merge pull request #5 from vistalab-technion/seq-align-missing-residu…

…es-fix Better handling for un-modelled residues
vistalab-technion · Jun 13, 2024 · abb854e · abb854e
2 parents 9e9d018 + 7ec2019
commit abb854e
Show file tree

Hide file tree

Showing 5 changed files with 266 additions and 134 deletions.
diff --git a/notebooks/pp5_demo.ipynb b/notebooks/pp5_demo.ipynb
@@ -186,8 +186,8 @@
    ],
    "source": [
     "# Sequence of AAs as a SeqRecord\n",
-    "seq_rec = prec.protein_seq\n",
-    "print(f\"{seq_rec.seq!s}, len={len(seq_rec)}\")"
+    "aa_seq = prec.aa_seq\n",
+    "print(f\"{aa_seq}, len={len(aa_seq)}\")"
    ]
   },
   {

diff --git a/src/pp5/align.py b/src/pp5/align.py
@@ -12,7 +12,7 @@
 import warnings
 import contextlib
 import subprocess
-from typing import Any, Dict, Tuple, Union, Iterable, Optional
+from typing import Any, Dict, List, Tuple, Union, Iterable, Optional
 from pathlib import Path
 from datetime import datetime, timedelta
 
@@ -21,11 +21,12 @@
 from tqdm import tqdm
 from Bio.Seq import Seq
 
+from pp5.codons import ACIDS_1TO3, UNKNOWN_AA
 from pp5.external_dbs.pdb import PDB_RCSB
 
 with warnings.catch_warnings():
     warnings.simplefilter("ignore", BiopythonExperimentalWarning)
-    from Bio.Align import substitution_matrices
+    from Bio.Align import substitution_matrices, PairwiseAligner, Alignment
 
 from Bio.AlignIO import MultipleSeqAlignment as MSA
 from Bio.SeqRecord import SeqRecord
@@ -57,6 +58,75 @@
 BLOSUM90 = substitution_matrices.load("BLOSUM90")
 
 
+def pairwise_alignment_map(
+    src_seq: str,
+    tgt_seq: str,
+    open_gap_score: float = -10,
+    extend_gap_score: float = -0.5,
+) -> Tuple[Alignment, Dict[int, int]]:
+    """
+    Aligns between two AA sequences and produces a map from the indices of
+    the source sequence to the indices of the target sequence.
+    Uses biopython's PairwiseAligner with BLOSUM80 matrix.
+    In case there is more than one alignment option, the one with the highest score
+    and smallest number of gaps will be chosen.
+
+    :param src_seq: Source AA sequence to align.
+    :param tgt_seq: Target AA sequence to align.
+    :param open_gap_score: Penalty for opening a gap in the alignment.
+    :param extend_gap_score: Penalty for extending a gap by one residue.
+    :return: A tuple with two elements:
+    -  The alignment object produced by the aligner.
+    -  A dict mapping from an index in the source sequence to the corresponding
+       index in the target sequence.
+    """
+    aligner = PairwiseAligner(
+        substitution_matrix=BLOSUM80,
+        open_gap_score=open_gap_score,
+        extend_gap_score=extend_gap_score,
+    )
+
+    # In rare cases, there could be unknown letters in the sequences. This causes
+    # the alignment to break. Replace with "X" which the aligner can handle.
+    unknown_aas = set(src_seq).union(set(tgt_seq)) - set(ACIDS_1TO3)
+    for unk_aa in unknown_aas:  # usually there are none
+        tgt_seq = tgt_seq.replace(unk_aa, UNKNOWN_AA)
+        src_seq = src_seq.replace(unk_aa, UNKNOWN_AA)
+
+    # The aligner returns multiple alignment options
+    multi_alignments = aligner.align(src_seq, tgt_seq)
+
+    # Choose alignment with maximal score and minimum number of gaps
+    def _align_sort_key(a: Alignment) -> Tuple[int, int]:
+        _n_gaps = a.coordinates.shape[1]
+        return -a.score, _n_gaps
+
+    alignment = sorted(multi_alignments, key=_align_sort_key)[0]
+
+    # Alignment contains two tuples each of length N (for N matching sub-sequences)
+    # (
+    #   ((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)),
+    #   ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN))
+    # )
+    src_to_tgt: List[Tuple[int, int]] = []
+    src_subseqs, tgt_subseqs = alignment.aligned
+    assert len(src_subseqs) == len(tgt_subseqs)
+    for i in range(len(src_subseqs)):
+        src_start, src_end = src_subseqs[i]
+        tgt_start, tgt_end = tgt_subseqs[i]
+        assert src_end - src_start == tgt_end - tgt_start
+
+        for j in range(src_end - src_start):
+            if src_seq[src_start + j] != tgt_seq[tgt_start + j]:
+                # There are mismatches included in the match sequence (cases
+                # where a similar AA is not considered a complete mismatch).
+                # We are stricter: require exact match.
+                continue
+            src_to_tgt.append((src_start + j, tgt_start + j))
+
+    return alignment, dict(src_to_tgt)
+
+
 def multiseq_align(
     seqs: Iterable[SeqRecord] = None, in_file=None, out_file=None, **clustal_kw
 ) -> MSA:

diff --git a/src/pp5/collect.py b/src/pp5/collect.py
@@ -58,6 +58,7 @@
 COL_PDB_SOURCE = "pdb_source"
 COL_REJECTED_BY = "rejected_by"
 COL_NUM_ALTLOCS = "num_altlocs"
+COL_NUM_UNMODELLED = "num_unmodelled"
 
 COLLECTION_METADATA_FILENAME = "meta.json"
 ALL_STRUCTS_FILENAME = "meta-structs_all"
@@ -1145,6 +1146,12 @@ def _collect_single_structure(
                 COL_SEQ_LEN: seq_len,
                 COL_SEQ_GAPS: str.join(";", [f"{s}-{e}" for (s, e) in prec.seq_gaps]),
                 COL_NUM_ALTLOCS: prec.num_altlocs,
+                **{
+                    f"{COL_NUM_UNMODELLED}_{suffix}": n
+                    for n, suffix in zip(
+                        prec.num_unmodelled, ["nterm", "inter", "cterm"]
+                    )
+                },
                 COL_PDB_SOURCE: pdb_source,
                 **meta.as_dict(chain_id=chain_id, seq_to_str=True),
             }