diff --git a/flashtext/keyword.py b/flashtext/keyword.py
index b5cb1d1..7efcc83 100644
--- a/flashtext/keyword.py
+++ b/flashtext/keyword.py
@@ -487,13 +487,12 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
         while idx < sentence_len:
             char = sentence[idx]
             # when we reach a character that might denote word end
+            longest_sequence_found = None
             if char not in self.non_word_boundaries:
 
-                # if end is present in current_dict
                 if self._keyword in current_dict or char in current_dict:
                     # update longest sequence found
                     sequence_found = None
-                    longest_sequence_found = None
                     is_longer_seq_found = False
                     if self._keyword in current_dict:
                         sequence_found = current_dict[self._keyword]
@@ -503,7 +502,6 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
                     # re look for longest_sequence from this position
                     if char in current_dict:
                         current_dict_continued = current_dict[char]
-
                         idy = idx + 1
                         while idy < sentence_len:
                             inner_char = sentence[idy]
@@ -525,6 +523,13 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
                                 if not current_dict_continued:
                                     break
                             else:
+                                # A keyword was matched whose last character is not in non_word_boundaries (e.g. a CJK character),
+                                # but the next character in the sentence is, e.g. extracting 孤儿药 from 孤儿药DFA.
+                                if self._keyword in current_dict_continued and current_dict_continued[self._keyword][-1] not in self.non_word_boundaries and inner_char in self.non_word_boundaries:
+                                    # update longest sequence found
+                                    longest_sequence_found = current_dict_continued[self._keyword]
+                                    sequence_end_pos = idy
+                                    is_longer_seq_found = True
                                 break
                             idy += 1
                         else:
@@ -567,7 +572,7 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
                     if char not in self.non_word_boundaries:
                         break
                     idy += 1
-                idx = idy
+                idx = idy - 1
             # if we are end of sentence and have a sequence discovered
             if idx + 1 >= sentence_len:
                 if self._keyword in current_dict:
@@ -576,6 +581,8 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
             idx += 1
             if reset_current_dict:
                 reset_current_dict = False
+                if longest_sequence_found:
+                    idx -= 1
                 sequence_start_pos = idx
         if span_info:
             return keywords_extracted
diff --git a/test/keyword_extractor_test_cases.json b/test/keyword_extractor_test_cases.json
index 6280771..a1a43e9 100644
--- a/test/keyword_extractor_test_cases.json
+++ b/test/keyword_extractor_test_cases.json
@@ -476,5 +476,24 @@
         "explanation": "",
         "keywords": ["spring framework"],
         "keywords_case_sensitive": ["spring framework"]
+    },
+    {
+        "sentence": "苹果香蕉的英文单词是apple banana",
+        "keyword_dict": {
+            "苹果": ["苹果"],
+            "香蕉": ["香蕉"]
+        },
+        "explanation": "Chinese keywords followed by English words",
+        "keywords": ["苹果", "香蕉"],
+        "keywords_case_sensitive": ["苹果", "香蕉"]
+    },
+    {
+        "sentence": "拓新天成的TX103产品已经获得中国发明专利和国际专利授权。此外,该产品治疗恶性脑胶质瘤已于今年6月获得了FDA孤儿药资格认定。",
+        "keyword_dict": {
+            "孤儿药": ["孤儿药"]
+        },
+        "explanation": "Chinese keyword directly preceded by ASCII characters",
+        "keywords": ["孤儿药"],
+        "keywords_case_sensitive": ["孤儿药"]
     }
 ]
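
For reference, a minimal sketch of how the patched extractor is expected to behave on the new fixtures. This is illustrative only and assumes flashtext is installed from this branch; the keywords, sentences, and expected results are taken directly from the test cases added above.

    from flashtext import KeywordProcessor

    kp = KeywordProcessor()
    # Same keyword dictionaries as the added test cases: clean name -> surface forms.
    kp.add_keywords_from_dict({"苹果": ["苹果"], "香蕉": ["香蕉"], "孤儿药": ["孤儿药"]})

    # Chinese keywords followed by English words.
    print(kp.extract_keywords("苹果香蕉的英文单词是apple banana"))
    # expected: ['苹果', '香蕉']

    # Chinese keyword directly preceded by ASCII characters ("FDA孤儿药").
    print(kp.extract_keywords("拓新天成的TX103产品已经获得中国发明专利和国际专利授权。此外,该产品治疗恶性脑胶质瘤已于今年6月获得了FDA孤儿药资格认定。"))
    # expected: ['孤儿药']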