Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug when extracting multiple adjacent words from a string without word boundaries #142

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions flashtext/keyword.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,13 +487,12 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
while idx < sentence_len:
char = sentence[idx]
# when we reach a character that might denote word end
longest_sequence_found = None
if char not in self.non_word_boundaries:

# if end is present in current_dict
if self._keyword in current_dict or char in current_dict:
# update longest sequence found
sequence_found = None
longest_sequence_found = None
is_longer_seq_found = False
if self._keyword in current_dict:
sequence_found = current_dict[self._keyword]
Expand All @@ -503,7 +502,6 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
# re look for longest_sequence from this position
if char in current_dict:
current_dict_continued = current_dict[char]

idy = idx + 1
while idy < sentence_len:
inner_char = sentence[idy]
Expand All @@ -525,6 +523,13 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
if not current_dict_continued:
break
else:
# A keyword has been matched whose last character is not in non_word_boundaries, while the character that follows it (inner_char) is in non_word_boundaries
# e.g. matching 孤儿药 inside 孤儿药DFA
if self._keyword in current_dict_continued and current_dict_continued[self._keyword][-1] not in self.non_word_boundaries and inner_char in self.non_word_boundaries:
# update longest sequence found
longest_sequence_found = current_dict_continued[self._keyword]
sequence_end_pos = idy
is_longer_seq_found = True
break
idy += 1
else:
Expand Down Expand Up @@ -567,7 +572,7 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
if char not in self.non_word_boundaries:
break
idy += 1
idx = idy
idx = idy-1
# if we are end of sentence and have a sequence discovered
if idx + 1 >= sentence_len:
if self._keyword in current_dict:
Expand All @@ -576,6 +581,8 @@ def extract_keywords(self, sentence, span_info=False, max_cost=0):
idx += 1
if reset_current_dict:
reset_current_dict = False
if longest_sequence_found:
idx -= 1
sequence_start_pos = idx
if span_info:
return keywords_extracted
Expand Down
19 changes: 19 additions & 0 deletions test/keyword_extractor_test_cases.json
Original file line number Diff line number Diff line change
Expand Up @@ -476,5 +476,24 @@
"explanation": "",
"keywords": ["spring framework"],
"keywords_case_sensitive": ["spring framework"]
},
{
"sentence": "苹果香蕉的英文单词是apple banana",
"keyword_dict": {
"苹果": ["苹果"],
"香蕉": ["香蕉"]
},
"explanation": "Chinese word test",
"keywords": ["苹果", "香蕉"],
"keywords_case_sensitive": ["苹果","香蕉"]
},
{
"sentence": "拓新天成的TX103产品已经获得中国发明专利和国际专利授权。此外,该产品治疗恶性脑胶质瘤已于今年6月获得了FDA孤儿药资格认定。",
"keyword_dict": {
"孤儿药": ["孤儿药"]
},
"explanation": "Chinese word test",
"keywords": ["孤儿药"],
"keywords_case_sensitive": ["孤儿药"]
}
]