Skip to content

Commit

Permalink
allow duplicated matching to the best match
Browse files Browse the repository at this point in the history
  • Loading branch information
jykr committed Nov 1, 2024
1 parent 54f0026 commit 80c7afc
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 11 deletions.
40 changes: 31 additions & 9 deletions bean/mapping/GuideEditCounter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Tuple, Optional, Sequence, Union
from typing import Tuple, Optional, Sequence, Union, Iterable
import gzip
import logging
from copy import deepcopy
Expand Down Expand Up @@ -30,6 +30,7 @@ def tqdm(iterable, **kwargs):
_write_alignment_matrix,
_write_paired_end_reads,
revcomp,
find_closest_sequence_index
)

logging.basicConfig(
Expand Down Expand Up @@ -126,12 +127,15 @@ def __init__(self, **kwargs):
- kwargs["gstart_reporter"]
- self.screen.guides["guide_len"].max()
)
self.map_duplicated_to_best = kwargs["map_duplicated_to_best"]
self.map_duplicated_hamming_threshold = kwargs["map_duplicated_hamming_threshold"]
self.count_guide_edits = kwargs["count_guide_edits"]
if self.count_guide_edits:
self.screen.uns["guide_edit_counts"] = {}
self.count_reporter_edits = (
kwargs["count_reporter"] or kwargs["count_guide_reporter_alleles"]
)
self.mask_barcode = kwargs["mask_barcode"]
if self.count_reporter_edits:
self.screen.uns["edit_counts"] = {}
self.gstart_reporter = kwargs["gstart_reporter"]
Expand Down Expand Up @@ -584,6 +588,7 @@ def _get_guide_counts_bcmatch_semimatch(
semimatch,
R2_start,
) = self._match_read_to_sgRNA_bcmatch_semimatch(R1_seq, R2_seq)
mapped = False
if len(bc_match) == 0:
if len(semimatch) == 0: # no guide match
if self.keep_intermediate:
Expand All @@ -597,19 +602,26 @@ def _get_guide_counts_bcmatch_semimatch(
r1, r2, outfile_R1_dup_wo_bc, outfile_R2_dup_wo_bc
)
self.duplicate_match_wo_barcode += 1
if self.map_duplicated_to_best:
best_matched_guide_idx = self._find_closest_sequence(R1_seq, R2_seq, semimatch, self.map_duplicated_hamming_threshold)
if best_matched_guide_idx is not None:
self.screen.layers[semimatch_layer][best_matched_guide_idx, 0] += 1
else: # guide match with no barcode match
matched_guide_idx = semimatch[0]
self.screen.layers[semimatch_layer][matched_guide_idx, 0] += 1
if self.count_guide_edits:
guide_allele, _ = self._count_guide_edits(matched_guide_idx, r1)
self.semimatch += 1

elif len(bc_match) >= 2: # duplicate mapping
if self.keep_intermediate:
_write_paired_end_reads(r1, r2, outfile_R1_dup, outfile_R2_dup)
self.duplicate_match += 1

if self.map_duplicated_to_best:
best_matched_guide_idx = self._find_closest_sequence(R1_seq, R2_seq, bc_match, self.map_duplicated_hamming_threshold)
if best_matched_guide_idx is not None:
mapped = True
else: # unique barcode match
mapped = True
if mapped:
matched_guide_idx = bc_match[0]
self.screen.layers[bcmatch_layer][matched_guide_idx, 0] += 1
self.bcmatch += 1
Expand Down Expand Up @@ -732,7 +744,7 @@ def get_barcode(self, R1_seq, R2_seq):
R2_seq[barcode_start_idx : (barcode_start_idx + self.guide_bc_len)]
)

def _match_read_to_sgRNA_bcmatch_semimatch(self, R1_seq: str, R2_seq: str):
def _match_read_to_sgRNA_bcmatch_semimatch(self, R1_seq: str, R2_seq: str) -> Tuple[int, int, int]:
# This should be adjusted for each experimental recipes.
bc_start_idx, guide_barcode = self.get_barcode(R1_seq, R2_seq)
bc_match_idx = np.array([])
Expand All @@ -745,17 +757,27 @@ def _match_read_to_sgRNA_bcmatch_semimatch(self, R1_seq: str, R2_seq: str):
_seq_match = np.where(
self.mask_sequence(seq) == self.screen.guides.masked_sequence
)[0]
_bc_match = np.where(
self.mask_sequence(guide_barcode) == self.screen.guides.masked_barcode
)[0]
if self.mask_barcode:
_bc_match = np.where(
self.mask_sequence(guide_barcode) == self.screen.guides.masked_barcode
)[0]
else:
_bc_match = np.where(
guide_barcode == self.screen.guides.barcode
)[0]

bc_match_idx = np.append(
bc_match_idx, np.intersect1d(_seq_match, _bc_match)
)
semimatch_idx = np.append(semimatch_idx, _seq_match)

return bc_match_idx.astype(int), semimatch_idx.astype(int), bc_start_idx

def _find_closest_sequence(self, R1_seq, R2_seq, ref_guide_indices: Sequence[int], hamming_distance_threshold: int = 0.1,) -> int:
guide_lens = self.screen.guides.iloc[ref_guide_indices].map(len)
query_guide_seqs = [self.get_guide_seq(R1_seq, R2_seq, guide_len) for guide_len in guide_lens]
closest_idx = find_closest_sequence_index(query_guide_seqs, self.screen.guides.sequence.iloc[ref_guide_indices].values.tolist(), hamming_distance_threshold, match_score, mismatch_penalty, allowed_substitution_penalties = {(k, v):0.2 for k, v in self.target_base_edits})
return ref_guide_indices[closest_idx]

def _get_guide_position_seq_of_read(self, seq):
guide_start_idx = self._get_guide_start_idx(seq)
if guide_start_idx == -1:
Expand Down
80 changes: 79 additions & 1 deletion bean/mapping/_supporting_fn.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Union, Dict, Optional, Tuple
from typing import List, Union, Dict, Optional, Tuple, Iterable
import subprocess as sb
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -313,3 +313,81 @@ def _multiindex_dict_to_df(input_dict, key_column_names, value_column_name):
inplace=True,
)
return df


def hamming_distance(
seq1: str,
seq2: str,
match_score: float = 0,
mismatch_penalty: float = 1,
allowed_substitutions_penalties: Dict[Tuple[str, str], float] = {
("A", "G"): 0.2,
("T", "C"): 0.2,
},
):
"""
Calculates the Hamming distance between two DNA sequences with different penalties.
Args:
seq1 (str): The first DNA sequence.
seq2 (str): The second DNA sequence.
match_score (int): Score for a match (default: 0).
mismatch_penalty (int): Penalty for a mismatch (default: 1).
Returns:
int: The Hamming distance.
"""

if len(seq1) != len(seq2):
raise ValueError("Sequences must be of equal length.")

distance = 0
for i in range(len(seq1)):
if seq1[i] == seq2[i]:
distance += match_score
elif (seq1[i], seq2[i]) in allowed_substitutions_penalties:
distance += allowed_substitutions_penalties[(seq1[i], seq2[i])]
else:
distance += mismatch_penalty
return distance


def find_closest_sequence_index(
query_seqs: Iterable[str],
ref_seqs: Iterable[str],
hamming_distance_threshold: int = 0.1,
match_score: float = 0,
mismatch_penalty: float = 1,
allowed_substitutions_penalties: Dict[Tuple[str, str], float] = {
("A", "G"): 0.2,
("T", "C"): 0.2,
},
) -> int:
"""
Find the closest sequence to a query sequence.
Args:
query_seq (str): The query sequence.
ref_seqs (Iterable[str]): The reference sequences.
hamming_distance_threshold (int, optional): The maximum allowed hamming distance. Defaults to 3.
match_score (int, optional): Score for a match (default: 0). Defaults to 0.
mismatch_penalty (int, optional): Penalty for a mismatch (default: 1). Defaults to 1.
allowed_substitutions_penalties (Dict[Tuple[str, str], float], optional): Allowed substitutions and penalties. Defaults to [("A", "G"):0.2, ("T", "C"):0.2].
Returns:
int: The closest sequence pair's index in the input ref_seqs list.
"""
min_distance = float("inf")
closest_seq_index = None
for i, (query_seq, ref_seq) in enumerate(zip(query_seqs, ref_seqs)):
distance = hamming_distance(
query_seq,
ref_seq,
match_score,
mismatch_penalty,
allowed_substitutions_penalties,
) / len(query_seq)
if distance < min_distance and distance <= hamming_distance_threshold:
min_distance = distance
closest_seq_index = i
return closest_seq_index
16 changes: 16 additions & 0 deletions bean/mapping/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,22 @@ def _get_input_parser(parser=None):
help="count the matched allele of guide and reporter edit",
action="store_true",
)
parser.add_argument(
"--map-duplicated-to-best",
help="When found duplicated mapping allowing for intended edits, map them to the best-matching reads",
action="store_true",
)
parser.add_argument(
"--map-duplicated-hamming-threshold",
help="When found duplicated mapping allowing for intended edits, map them to the best-matching reads only when hamming distance is less than 5*value*guide_length for intended edit",
type=float,
default=0.1,
)
parser.add_argument(
"--mask-barcode",
help="Allow intended base edit in the barcode sequence.",
action="store_true",
)
parser.add_argument(
"--tiling",
help="Specify that the guide library is tiling library without 'n guides per target' design",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_count_samples_dual():

@pytest.mark.order(107)
def test_count_samples_bcstart():
cmd = "bean count-samples -i tests/data/sample_list.csv -b A -f tests/data/test_guide_info.csv -o tests/test_res/var2/ -r --barcode-start-seq=GGAA"
cmd = "bean count-samples -i tests/data/sample_list.csv -b A -f tests/data/test_guide_info.csv -o tests/test_res/var2/ -r --barcode-start-seq=GGAA --map-duplicated-to-best"
try:
subprocess.check_output(
cmd,
Expand Down

0 comments on commit 80c7afc

Please sign in to comment.