vi3k6i5 · lishukan · Sep 21, 2023 · Sep 21, 2023 · Oct 23, 2023 · Jul 3, 2024
diff --git a/flashtext/keyword.py b/flashtext/keyword.py
@@ -487,13 +487,12 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
         while idx < sentence_len:
             char = sentence[idx]
             # when we reach a character that might denote word end
+            longest_sequence_found = None
             if char not in self.non_word_boundaries:
-
                 # if end is present in current_dict
                 if self._keyword in current_dict or char in current_dict:
                     # update longest sequence found
                     sequence_found = None
-                    longest_sequence_found = None
                     is_longer_seq_found = False
                     if self._keyword in current_dict:
                         sequence_found = current_dict[self._keyword]
@@ -503,7 +502,6 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
                     # re look for longest_sequence from this position
                     if char in current_dict:
                         current_dict_continued = current_dict[char]
-
                         idy = idx + 1
                         while idy < sentence_len:
                             inner_char = sentence[idy]
@@ -525,6 +523,13 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
                                 if not current_dict_continued:
                                     break
                             else:
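+                                # A keyword ends here (self._keyword is in current_dict_continued), its last character is not in non_word_boundaries, and the next character (inner_char) is in non_word_boundaries.
+                                # For example, extracting 孤儿药 from 孤儿药DFA.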
+                                if self._keyword in current_dict_continued and current_dict_continued[self._keyword][-1] not in self.non_word_boundaries and inner_char in self.non_word_boundaries:
+                                    # update longest sequence found
+                                    longest_sequence_found = current_dict_continued[self._keyword]
+                                    sequence_end_pos = idy
+                                    is_longer_seq_found = True
                                 break
                             idy += 1
                         else:
@@ -567,7 +572,7 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
                     if char not in self.non_word_boundaries:
                         break
                     idy += 1
-                idx = idy
+                idx = idy-1
             # if we are end of sentence and have a sequence discovered
             if idx + 1 >= sentence_len:
                 if self._keyword in current_dict:
@@ -576,6 +581,8 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
             idx += 1
             if reset_current_dict:
                 reset_current_dict = False
+                if longest_sequence_found:
+                    idx -= 1
                 sequence_start_pos = idx
         if span_info:
             return keywords_extracted

diff --git a/test/keyword_extractor_test_cases.json b/test/keyword_extractor_test_cases.json
@@ -476,5 +476,24 @@
         "explanation": "",
         "keywords": ["spring framework"],
         "keywords_case_sensitive": ["spring framework"]
+    },
+    {
+        "sentence": "苹果香蕉的英文单词是apple banana",
+        "keyword_dict": {
+            "苹果": ["苹果"],
+            "香蕉": ["香蕉"]
+        },
+        "explanation": "Chinese word test",
+        "keywords": ["苹果", "香蕉"],
+        "keywords_case_sensitive": ["苹果","香蕉"]
+    },
+    {
+        "sentence": "拓新天成的TX103产品已经获得中国发明专利和国际专利授权。此外，该产品治疗恶性脑胶质瘤已于今年6月获得了FDA孤儿药资格认定。",
+        "keyword_dict": {
+            "孤儿药": ["孤儿药"]
+        },
+        "explanation": "Chinese word test",
+        "keywords": ["孤儿药"],
+        "keywords_case_sensitive": ["孤儿药"]
     }
 ]