Merge branch 'master' into master
kkaiser authored May 3, 2020
2 parents 81a3f0a + b316c7e commit 63c8583
Showing 4 changed files with 408 additions and 3 deletions.
134 changes: 131 additions & 3 deletions flashtext/keyword.py
@@ -448,12 +448,14 @@ def get_all_keywords(self, term_so_far='', current_dict=None):
terms_present[key] = sub_values[key]
return terms_present

def extract_keywords(self, sentence, span_info=False):
def extract_keywords(self, sentence, span_info=False, max_cost=0):
"""Searches in the string for all keywords present in corpus.
Keywords present are added to a list `keywords_extracted` and returned.
Args:
sentence (str): Line of text where we will search for keywords
span_info (bool): True if you also need the span (start and end position) of each extracted keyword
max_cost (int): maximum Levenshtein distance to accept when extracting keywords
Returns:
keywords_extracted (list(str)): List of terms/keywords found in sentence that match our corpus
@@ -466,7 +468,9 @@ def extract_keywords(self, sentence, span_info=False):
>>> keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
>>> keywords_found
>>> ['New York', 'Bay Area']
>>> keywords_found = keyword_processor.extract_keywords('I love Big Aple and Baay Area.', max_cost=1)
>>> keywords_found
>>> ['New York', 'Bay Area']
"""
keywords_extracted = []
if not sentence:
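As a quick usage sketch (same corpus as the docstring above; the span positions are illustrative of the (keyword, start, end) tuples returned with span_info=True):

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')

# Exact matching, unchanged behaviour:
keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
# ['New York', 'Bay Area']

# Fuzzy matching: misspellings within Levenshtein distance max_cost still match.
keyword_processor.extract_keywords('I love Big Aple and Baay Area.', max_cost=1)
# ['New York', 'Bay Area']

# span_info=True returns (keyword, start, end) tuples instead of bare keywords,
# e.g. ('New York', 7, 16) for the 'Big Apple' occurrence above.
keyword_processor.extract_keywords('I love Big Apple and Bay Area.', span_info=True)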
@@ -478,6 +482,7 @@ def extract_keywords(self, sentence, span_info=False):
reset_current_dict = False
idx = 0
sentence_len = len(sentence)
curr_cost = max_cost
while idx < sentence_len:
char = sentence[idx]
if not self.case_sensitive:
@@ -512,6 +517,16 @@ def extract_keywords(self, sentence, span_info=False):
is_longer_seq_found = True
if inner_char in current_dict_continued:
current_dict_continued = current_dict_continued[inner_char]
elif curr_cost > 0:
next_word = self.get_next_word(sentence[idy:])
current_dict_continued, cost, _ = next(
self.levensthein(next_word, max_cost=curr_cost, start_node=current_dict_continued),
({}, 0, 0),
) # current_dict_continued defaults to an empty dict, so the next iteration hits the `break` below
curr_cost -= cost
idy += len(next_word) - 1
if not current_dict_continued:
break
else:
break
idy += 1
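The next(..., default) calls above take the first (node, cost, depth) candidate that the levensthein generator yields within the remaining cost budget, and fall back to the default when nothing is reachable. A standalone illustration of that pattern, reusing the corpus from the levensthein docstring further down:

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('Marie', 'Mary')

# First fuzzy candidate within max_cost, or a harmless default when there is none.
node, cost, depth = next(
    keyword_processor.levensthein('Maria', max_cost=1),
    ({}, 0, 0),  # empty node: the extraction loop above would then break / reset
)
# node -> {'_keyword_': 'Mary'}, cost -> 1, depth -> 5 (as in the docstring below)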
@@ -527,6 +542,7 @@ def extract_keywords(self, sentence, span_info=False):
current_dict = self.keyword_trie_dict
if longest_sequence_found:
keywords_extracted.append((longest_sequence_found, sequence_start_pos, idx))
curr_cost = max_cost
reset_current_dict = True
else:
# we reset current_dict
@@ -535,6 +551,14 @@
elif char in current_dict:
# we can continue from this char
current_dict = current_dict[char]
elif curr_cost > 0:
next_word = self.get_next_word(sentence[idx:])
current_dict, cost, _ = next(
self.levensthein(next_word, max_cost=curr_cost, start_node=current_dict),
(self.keyword_trie_dict, 0, 0)
)
curr_cost -= cost
idx += len(next_word) - 1
else:
# we reset current_dict
current_dict = self.keyword_trie_dict
@@ -562,7 +586,7 @@ def extract_keywords(self, sentence, span_info=False):
return keywords_extracted
return [value[0] for value in keywords_extracted]

def replace_keywords(self, sentence):
def replace_keywords(self, sentence, max_cost=0):
"""Searches in the string for all keywords present in corpus.
Keywords present are replaced by the clean name and a new string is returned.
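By analogy with extract_keywords above, a hedged sketch of what the new max_cost parameter is expected to do here (the fuzzy-replacement output is assumed from the registered clean names, not asserted by the visible part of this diff):

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')

keyword_processor.replace_keywords('I love Big Apple and Bay Area.')
# 'I love New York and Bay Area.'

# Assumed output, mirroring the fuzzy extraction example: misspellings within
# Levenshtein distance 1 are replaced by their clean names as well.
keyword_processor.replace_keywords('I love Big Aple and Baay Area.', max_cost=1)
# 'I love New York and Bay Area.'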
@@ -593,13 +617,15 @@ def replace_keywords(self, sentence):
sequence_end_pos = 0
idx = 0
sentence_len = len(sentence)
curr_cost = max_cost
while idx < sentence_len:
char = sentence[idx]
if not self.case_sensitive:
char = char.lower()
current_word += orig_sentence[idx]
# when we reach whitespace
if char not in self.non_word_boundaries:
current_word += orig_sentence[idx]
current_white_space = char
# if end is present in current_dict
if self._keyword in current_dict or char in current_dict:
@@ -623,13 +649,26 @@ def replace_keywords(self, sentence):
inner_char = inner_char.lower()
current_word_continued += orig_sentence[idy]
if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued:
current_word_continued += orig_sentence[idy]
# update longest sequence found
current_white_space = inner_char
longest_sequence_found = current_dict_continued[self._keyword]
sequence_end_pos = idy
is_longer_seq_found = True
if inner_char in current_dict_continued:
current_word_continued += orig_sentence[idy]
current_dict_continued = current_dict_continued[inner_char]
elif curr_cost > 0:
next_word = self.get_next_word(sentence[idy:])
current_dict_continued, cost, _ = next(
self.levensthein(next_word, max_cost=curr_cost, start_node=current_dict_continued),
({}, 0, 0)
)
idy += len(next_word) - 1
curr_cost -= cost
current_word_continued += next_word # keep the original text in case there is ultimately no match
if not current_dict_continued:
break
else:
break
idy += 1
@@ -646,6 +685,7 @@ def replace_keywords(self, sentence):
current_word = current_word_continued
current_dict = self.keyword_trie_dict
if longest_sequence_found:
curr_cost = max_cost
new_sentence.append(longest_sequence_found + current_white_space)
current_word = ''
current_white_space = ''
@@ -661,8 +701,20 @@ def replace_keywords(self, sentence):
current_white_space = ''
elif char in current_dict:
# we can continue from this char
current_word += orig_sentence[idx]
current_dict = current_dict[char]
elif curr_cost > 0:
next_orig_word = self.get_next_word(orig_sentence[idx:])
next_word = next_orig_word if self.case_sensitive else str.lower(next_orig_word)
current_dict, cost, _ = next(
self.levensthein(next_word, max_cost=curr_cost, start_node=current_dict),
(self.keyword_trie_dict, 0, 0)
)
idx += len(next_word) - 1
curr_cost -= cost
current_word += next_orig_word # keep the original text in case there is ultimately no match
else:
current_word += orig_sentence[idx]
# we reset current_dict
current_dict = self.keyword_trie_dict
# skip to end of word
@@ -688,3 +740,79 @@ def replace_keywords(self, sentence):
new_sentence.append(current_word)
idx += 1
return "".join(new_sentence)

def get_next_word(self, sentence):
"""
Retrieve the next word in the sequence.
Iterates through the string until it finds the first char that is not in non_word_boundaries.
Args:
sentence (str): Line of text where we will look for the next word
Returns:
next_word (str): The next word in the sentence
Examples:
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword('Big Apple')
>>> keyword_processor.get_next_word('Big Apple')
>>> 'Big'
"""
next_word = str()
for char in sentence:
if char not in self.non_word_boundaries:
break
next_word += char
return next_word

def levensthein(self, word, max_cost=2, start_node=None):
"""
Retrieve the nodes where there is a fuzzy match,
via levenshtein distance, and with respect to max_cost
Args:
word (str): word to find a fuzzy match for
max_cost (int): maximum levenshtein distance when performing the fuzzy match
start_node (dict): Trie node from which the search is performed
Yields:
node, cost, depth (tuple): A tuple containing the final node,
the cost (i.e. the distance), and the depth in the trie
Examples:
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor(case_sensitive=True)
>>> keyword_processor.add_keyword('Marie', 'Mary')
>>> next(keyword_processor.levensthein('Maria', max_cost=1))
>>> ({'_keyword_': 'Mary'}, 1, 5)
...
>>> keyword_processor = KeywordProcessor(case_sensitive=True)
>>> keyword_processor.add_keyword('Marie Blanc', 'Mary')
>>> next(keyword_processor.levensthein('Mari', max_cost=1))
>>> ({' ': {'B': {'l': {'a': {'n': {'c': {'_keyword_': 'Mary'}}}}}}}, 1, 5)
"""
start_node = start_node or self.keyword_trie_dict
rows = range(len(word) + 1)

for char, node in start_node.items():
yield from self._levenshtein_rec(char, node, word, rows, max_cost, depth=1)


def _levenshtein_rec(self, char, node, word, rows, max_cost, depth=0):
n_columns = len(word) + 1
new_rows = [rows[0] + 1]
cost = 0

for col in range(1, n_columns):
insert_cost = new_rows[col - 1] + 1
delete_cost = rows[col] + 1
replace_cost = rows[col - 1] + int(word[col - 1] != char)
cost = min((insert_cost, delete_cost, replace_cost))
new_rows.append(cost)

stop_crit = isinstance(node, dict) and node.keys() & (self._white_space_chars | {self._keyword})
if new_rows[-1] <= max_cost and stop_crit:
yield node, cost, depth

elif isinstance(node, dict) and min(new_rows) <= max_cost:
for new_char, new_node in node.items():
yield from self._levenshtein_rec(new_char, new_node, word, new_rows, max_cost, depth=depth + 1)
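_levenshtein_rec carries one row of the classic Levenshtein dynamic-programming table per trie character it descends through: new_rows[-1] is the distance between the word and the trie path consumed so far, and min(new_rows) <= max_cost is the pruning test that abandons branches which can no longer match. A self-contained sketch of the same row update, outside the trie (illustrative only; the helper name is made up here):

def levenshtein_row(prev_row, word, char):
    # One DP step: given the previous row for `word`, return the row after
    # consuming `char` -- the same update performed inside _levenshtein_rec.
    new_row = [prev_row[0] + 1]
    for col in range(1, len(word) + 1):
        insert_cost = new_row[col - 1] + 1
        delete_cost = prev_row[col] + 1
        replace_cost = prev_row[col - 1] + int(word[col - 1] != char)
        new_row.append(min(insert_cost, delete_cost, replace_cost))
    return new_row

row = list(range(len('maria') + 1))   # [0, 1, 2, 3, 4, 5], as in levensthein()
for char in 'marie':                  # walk a trie path spelling 'marie'
    row = levenshtein_row(row, 'maria', char)
print(row[-1])                        # 1: edit distance between 'marie' and 'maria'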