diff --git a/flashtext/keyword.py b/flashtext/keyword.py index b5cb1d1..7e995e7 100644 --- a/flashtext/keyword.py +++ b/flashtext/keyword.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import os import string import io @@ -80,11 +81,11 @@ def __contains__(self, word): >>> # True """ - if not self.case_sensitive: - word = word.lower() current_dict = self.keyword_trie_dict len_covered = 0 for char in word: + if not self.case_sensitive: + char = char.lower() if char in current_dict: current_dict = current_dict[char] len_covered += 1 @@ -108,11 +109,11 @@ def __getitem__(self, word): >>> keyword_processor['Big Apple'] >>> # New York """ - if not self.case_sensitive: - word = word.lower() current_dict = self.keyword_trie_dict len_covered = 0 for char in word: + if not self.case_sensitive: + char = char.lower() if char in current_dict: current_dict = current_dict[char] len_covered += 1 @@ -141,10 +142,10 @@ def __setitem__(self, keyword, clean_name=None): clean_name = keyword if keyword and clean_name: - if not self.case_sensitive: - keyword = keyword.lower() current_dict = self.keyword_trie_dict for letter in keyword: + if not self.case_sensitive: + letter = letter.lower() current_dict = current_dict.setdefault(letter, {}) if self._keyword not in current_dict: status = True @@ -166,11 +167,11 @@ def __delitem__(self, keyword): """ status = False if keyword: - if not self.case_sensitive: - keyword = keyword.lower() current_dict = self.keyword_trie_dict character_trie_list = [] for letter in keyword: + if not self.case_sensitive: + letter = letter.lower() if letter in current_dict: character_trie_list.append((letter, current_dict)) current_dict = current_dict[letter] @@ -475,8 +476,6 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0): if not sentence: # if sentence is empty or none just return empty list return keywords_extracted - if not self.case_sensitive: - sentence = sentence.lower() current_dict = self.keyword_trie_dict sequence_start_pos = 0 
sequence_end_pos = 0 @@ -486,6 +485,8 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0): curr_cost = max_cost while idx < sentence_len: char = sentence[idx] + if not self.case_sensitive: + char = char.lower() # when we reach a character that might denote word end if char not in self.non_word_boundaries: @@ -507,6 +508,8 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0): idy = idx + 1 while idy < sentence_len: inner_char = sentence[idy] + if not self.case_sensitive: + inner_char = inner_char.lower() if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued: # update longest sequence found longest_sequence_found = current_dict_continued[self._keyword] @@ -564,6 +567,8 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0): idy = idx + 1 while idy < sentence_len: char = sentence[idy] + if not self.case_sensitive: + char = char.lower() if char not in self.non_word_boundaries: break idy += 1 @@ -606,8 +611,6 @@ def replace_keywords(self, sentence, max_cost=0): return sentence new_sentence = [] orig_sentence = sentence - if not self.case_sensitive: - sentence = sentence.lower() current_word = '' current_dict = self.keyword_trie_dict current_white_space = '' @@ -617,6 +620,8 @@ def replace_keywords(self, sentence, max_cost=0): curr_cost = max_cost while idx < sentence_len: char = sentence[idx] + if not self.case_sensitive: + char = char.lower() # when we reach whitespace if char not in self.non_word_boundaries: current_word += orig_sentence[idx] @@ -639,6 +644,8 @@ def replace_keywords(self, sentence, max_cost=0): idy = idx + 1 while idy < sentence_len: inner_char = sentence[idy] + if not self.case_sensitive: + inner_char = inner_char.lower() if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued: current_word_continued += orig_sentence[idy] # update longest sequence found @@ -712,6 +719,8 @@ def replace_keywords(self, sentence, max_cost=0): idy = idx + 1 
while idy < sentence_len: char = sentence[idy] + if not self.case_sensitive: + char = char.lower() current_word += orig_sentence[idy] if char not in self.non_word_boundaries: break idy += 1 diff --git a/test/keyword_extractor_test_cases.json b/test/keyword_extractor_test_cases.json index 6280771..edca626 100644 --- a/test/keyword_extractor_test_cases.json +++ b/test/keyword_extractor_test_cases.json @@ -476,5 +476,15 @@ "explanation": "", "keywords": ["spring framework"], "keywords_case_sensitive": ["spring framework"] + }, + { + "sentence": "İ love Big Apple and Bay Area.", + "keyword_dict": { + "İ love": ["İ love"], + "Big Apple": ["Big Apple"] + }, + "explanation": "Lowercasing keywords per character for correct span_info", + "keywords": ["İ love", "Big Apple"], + "keywords_case_sensitive": ["İ love", "Big Apple"] + } ] diff --git a/test/test_kp_extract_span.py b/test/test_kp_extract_span.py index 2b9f7a4..d549d18 100644 --- a/test/test_kp_extract_span.py +++ b/test/test_kp_extract_span.py @@ -19,7 +19,6 @@ def test_extract_keywords(self): """For each of the test case initialize a new KeywordProcessor. Add the keywords the test case to KeywordProcessor. Extract keywords and check if they match the expected result for the test case. 
- """ for test_id, test_case in enumerate(self.test_cases): keyword_processor = KeywordProcessor() @@ -27,16 +26,15 @@ def test_extract_keywords(self): keyword_processor.add_keywords_from_list(test_case['keyword_dict'][key]) keywords_extracted = keyword_processor.extract_keywords(test_case['sentence'], span_info=True) for kwd in keywords_extracted: - # returned keyword lowered should match the sapn from sentence + # returned keyword lowered should match the span from sentence self.assertEqual( - kwd[0].lower(), test_case['sentence'].lower()[kwd[1]:kwd[2]], + kwd[0].lower(), test_case['sentence'][kwd[1]:kwd[2]].lower(), "keywords span don't match the expected results for test case: {}".format(test_id)) def test_extract_keywords_case_sensitive(self): """For each of the test case initialize a new KeywordProcessor. Add the keywords the test case to KeywordProcessor. Extract keywords and check if they match the expected result for the test case. - """ for test_id, test_case in enumerate(self.test_cases): keyword_processor = KeywordProcessor(case_sensitive=True)