Skip to content

Commit

Permalink
Merge pull request #5 from vistalab-technion/seq-align-missing-residu…
Browse files Browse the repository at this point in the history
…es-fix

Better handling for un-modelled residues
  • Loading branch information
avivrosenberg authored Jun 13, 2024
2 parents 9e9d018 + 7ec2019 commit abb854e
Show file tree
Hide file tree
Showing 5 changed files with 266 additions and 134 deletions.
4 changes: 2 additions & 2 deletions notebooks/pp5_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,8 @@
],
"source": [
"# Sequence of AAs as a SeqRecord\n",
"seq_rec = prec.protein_seq\n",
"print(f\"{seq_rec.seq!s}, len={len(seq_rec)}\")"
"aa_seq = prec.aa_seq\n",
"print(f\"{aa_seq}, len={len(aa_seq)}\")"
]
},
{
Expand Down
74 changes: 72 additions & 2 deletions src/pp5/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import warnings
import contextlib
import subprocess
from typing import Any, Dict, Tuple, Union, Iterable, Optional
from typing import Any, Dict, List, Tuple, Union, Iterable, Optional
from pathlib import Path
from datetime import datetime, timedelta

Expand All @@ -21,11 +21,12 @@
from tqdm import tqdm
from Bio.Seq import Seq

from pp5.codons import ACIDS_1TO3, UNKNOWN_AA
from pp5.external_dbs.pdb import PDB_RCSB

with warnings.catch_warnings():
warnings.simplefilter("ignore", BiopythonExperimentalWarning)
from Bio.Align import substitution_matrices
from Bio.Align import substitution_matrices, PairwiseAligner, Alignment

from Bio.AlignIO import MultipleSeqAlignment as MSA
from Bio.SeqRecord import SeqRecord
Expand Down Expand Up @@ -57,6 +58,75 @@
BLOSUM90 = substitution_matrices.load("BLOSUM90")


def pairwise_alignment_map(
src_seq: str,
tgt_seq: str,
open_gap_score: float = -10,
extend_gap_score: float = -0.5,
) -> Tuple[Alignment, Dict[int, int]]:
"""
Aligns between two AA sequences and produces a map from the indices of
the source sequence to the indices of the target sequence.
Uses biopython's PairwiseAligner with BLOSUM80 matrix.
In case there is more than one alignment option, the one with the highest score
and smallest number of gaps will be chosen.
:param src_seq: Source AA sequence to align.
:param tgt_seq: Target AA sequence to align.
:param open_gap_score: Penalty for opening a gap in the alignment.
:param extend_gap_score: Penalty for extending a gap by one residue.
:return: A tuple with two elements:
- The alignment object produced by the aligner.
- A dict mapping from an index in the source sequence to the corresponding
index in the target sequence.
"""
aligner = PairwiseAligner(
substitution_matrix=BLOSUM80,
open_gap_score=open_gap_score,
extend_gap_score=extend_gap_score,
)

# In rare cases, there could be unknown letters in the sequences. This causes
# the alignment to break. Replace with "X" which the aligner can handle.
unknown_aas = set(src_seq).union(set(tgt_seq)) - set(ACIDS_1TO3)
for unk_aa in unknown_aas: # usually there are none
tgt_seq = tgt_seq.replace(unk_aa, UNKNOWN_AA)
src_seq = src_seq.replace(unk_aa, UNKNOWN_AA)

# The aligner returns multiple alignment options
multi_alignments = aligner.align(src_seq, tgt_seq)

# Choose alignment with maximal score and minimum number of gaps
def _align_sort_key(a: Alignment) -> Tuple[int, int]:
_n_gaps = a.coordinates.shape[1]
return -a.score, _n_gaps

alignment = sorted(multi_alignments, key=_align_sort_key)[0]

# Alignment contains two tuples each of length N (for N matching sub-sequences)
# (
# ((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)),
# ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN))
# )
src_to_tgt: List[Tuple[int, int]] = []
src_subseqs, tgt_subseqs = alignment.aligned
assert len(src_subseqs) == len(tgt_subseqs)
for i in range(len(src_subseqs)):
src_start, src_end = src_subseqs[i]
tgt_start, tgt_end = tgt_subseqs[i]
assert src_end - src_start == tgt_end - tgt_start

for j in range(src_end - src_start):
if src_seq[src_start + j] != tgt_seq[tgt_start + j]:
# There are mismatches included in the match sequence (cases
# where a similar AA is not considered a complete mismatch).
# We are stricter: require exact match.
continue
src_to_tgt.append((src_start + j, tgt_start + j))

return alignment, dict(src_to_tgt)


def multiseq_align(
seqs: Iterable[SeqRecord] = None, in_file=None, out_file=None, **clustal_kw
) -> MSA:
Expand Down
7 changes: 7 additions & 0 deletions src/pp5/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
COL_PDB_SOURCE = "pdb_source"
COL_REJECTED_BY = "rejected_by"
COL_NUM_ALTLOCS = "num_altlocs"
COL_NUM_UNMODELLED = "num_unmodelled"

COLLECTION_METADATA_FILENAME = "meta.json"
ALL_STRUCTS_FILENAME = "meta-structs_all"
Expand Down Expand Up @@ -1145,6 +1146,12 @@ def _collect_single_structure(
COL_SEQ_LEN: seq_len,
COL_SEQ_GAPS: str.join(";", [f"{s}-{e}" for (s, e) in prec.seq_gaps]),
COL_NUM_ALTLOCS: prec.num_altlocs,
**{
f"{COL_NUM_UNMODELLED}_{suffix}": n
for n, suffix in zip(
prec.num_unmodelled, ["nterm", "inter", "cterm"]
)
},
COL_PDB_SOURCE: pdb_source,
**meta.as_dict(chain_id=chain_id, seq_to_str=True),
}
Expand Down
Loading

0 comments on commit abb854e

Please sign in to comment.