From 9e06d37ff02edabbd0e8b85b2ee511dff7fadabe Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 2 Jun 2024 19:15:35 +0300 Subject: [PATCH] pairwise_alignment_map: Update to choose alignment with smallest number of gaps --- src/pp5/align.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index d27e649..4ea659d 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -68,6 +68,8 @@ def pairwise_alignment_map( Aligns between two AA sequences and produces a map from the indices of the source sequence to the indices of the target sequence. Uses biopython's PairwiseAligner with BLOSUM80 matrix. + In case there is more than one alignment option, the one with the highest score + and smallest number of gaps will be chosen. :param src_seq: Source AA sequence to align. :param tgt_seq: Target AA sequence to align. @@ -91,8 +93,15 @@ def pairwise_alignment_map( tgt_seq = tgt_seq.replace(unk_aa, UNKNOWN_AA) src_seq = src_seq.replace(unk_aa, UNKNOWN_AA) + # The aligner returns multiple alignment options multi_alignments = aligner.align(src_seq, tgt_seq) - alignment = sorted(multi_alignments, key=lambda a: a.score)[-1] + + # Choose alignment with maximal score and minimum number of gaps + def _align_sort_key(a: Alignment) -> Tuple[int, int]: + _n_gaps = a.coordinates.shape[1] + return -a.score, _n_gaps + + alignment = sorted(multi_alignments, key=_align_sort_key)[0] # Alignment contains two tuples each of length N (for N matching sub-sequences) # (