Skip to content

Commit

Permalink
Search engine: More commenting
Browse files Browse the repository at this point in the history
  • Loading branch information
mjuhanne committed Nov 12, 2023
1 parent 4066a03 commit 82fc11b
Showing 1 changed file with 22 additions and 30 deletions.
52 changes: 22 additions & 30 deletions addon/search_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,10 @@ def csv_to_list(csv):
]
sql_joins_txt = "".join(joins)


# we ignore these radicals for now because they don't have a keyword containing
# kanji counterpart in our database
ignore_radicals = ['亅','屮','禸','爻','尢','无','彑','黽','鬲','鬥','齊','龠','黹','黹','鬯']

# this table links radicals to their Heisig primitive counterpart to allow for
# even better search capability
# (some of them have identical visual look but differ only in Unicode)
unicode_conversion_table = {
# This conversion table links radicals to their Heisig primitive counterpart to allow
# for better search capability. Few of them have identical visual look but differ
# only in Unicode.
radical_conversion_table = {
'ノ' : '丿',
'|' : '丨',
'⺅' : '亻',
Expand All @@ -70,7 +65,7 @@ def csv_to_list(csv):
}

# When two characters reference each other as alternatives (for example 艹 -> 艸 and 艸 -> 艹 )
# then we want to link to the character which is the primary primitive
# then we want to link to the character which is the primary primitive.
primary_primitives = ['艹','扌','⻖','⻏','川','罒','冫','月']

class SearchEngine:
Expand Down Expand Up @@ -99,8 +94,8 @@ def __init__(self, db_cursor):
def radical_to_primitive(self,r):
# First do unicode conversion because some radicals in the list might use slightly
# # different (albeit visually indistinguishable) unicode character.
if r in unicode_conversion_table:
r = unicode_conversion_table[r]
if r in radical_conversion_table:
r = radical_conversion_table[r]
# .. then reference the main primitive instead if this is an alternative primitive
if r not in primary_primitives and r in self.primitive_alternative_cache:
r = self.primitive_alternative_cache[r]
Expand Down Expand Up @@ -131,7 +126,6 @@ def update_recursive_primitive_cache(self,character):
if p in self.keyword_set_cache:
kw_set = self.keyword_set_cache[p]
for kw in kw_set:
#if kw not in rec_primitive_names_list:
rec_primitive_names_list.append(kw)
else:
print("Note! Kanji %s references primitive %s without a keyword" % (character,p))
Expand All @@ -154,9 +148,6 @@ def init_cache(self):
r = self.radical_to_primitive(r)
if r in self.keyword_set_cache:
radical_names_set.update(self.keyword_set_cache[r])
else:
if r not in ignore_radicals:
print("Note! Kanji %s has unknown radical %s" % (c,r))
radical_names = ','.join(radical_names_set)
self.radical_name_set_cache[c] = radical_names_set
self.radical_name_cache[c] = radical_names
Expand Down Expand Up @@ -209,7 +200,7 @@ def update_cache(self, character=None):
if len(d['primitives'])>0:
self.primitive_list_cache[c] = custom_list(d['primitives'])

# Radicals..
# Radicals..
if len(d['radicals'])>0:
self.radical_set_cache[c] = set(custom_list(d['radicals']))

Expand Down Expand Up @@ -238,9 +229,9 @@ def update_cache(self, character=None):
st = d['usr_story'].lower() + d['koohi_stories'].lower()
st += d['heisig_story'].lower()
st += d['heisig_comment'].lower()

self.stories_cache[c] = st

# Frequency ranking points: Prioritize high frequency (and higher Kanken grade) kanjis.
points = 0
if d['frequency_rank'] is not None and d['frequency_rank'] != '':
fr_points = (4000 - d['frequency_rank'])/400
Expand Down Expand Up @@ -269,10 +260,9 @@ def update_cache(self, character=None):
self.update_recursive_primitive_cache(c)



def get_matching_characters(self, search_terms, pool, is_set, results, max_results):
def get_matching_characters(self, search_terms, pool, is_a_set, results, max_results):
for search_term,required_count in search_terms.items():
if is_set and required_count>1:
if is_a_set and required_count>1:
# we want more than 1 occurence but this is a set -> not found
return results

Expand All @@ -293,9 +283,9 @@ def get_matching_characters(self, search_terms, pool, is_set, results, max_resul
return results


def get_matching_characters_with_scoring(self, search_terms, pool, is_set, pool_priority, kanji_scores, kanji_matches):
def get_matching_characters_with_scoring(self, search_terms, pool, is_a_set, pool_priority, kanji_scores, kanji_matches):
for search_term,required_count in search_terms.items():
if is_set and required_count>1:
if is_a_set and required_count>1:
# we want more than 1 occurence but this is a set -> not found
return

Expand Down Expand Up @@ -323,12 +313,13 @@ def get_matching_characters_with_scoring(self, search_terms, pool, is_set, pool_
def get_matching_characters_from_list_of_pools(self, search_terms, pool_list, max_results):

if len(search_terms) == 1:
# In the case of only one search term its a simple exhaustive search until enough matches are found.
# Search goes through all search pools (keywords, primitive names, free text search) starting
# In the case of only one search term its a matter of simple exhaustive search
# until enough matches are found. Search crams through all search pools
# (keywords, primitive names, free text search) starting
# from the most prioritized one
results = []
for pool_priority, pool, is_set in pool_list:
self.get_matching_characters(search_terms, pool, is_set, results, max_results)
for pool_priority, pool, is_a_set in pool_list:
self.get_matching_characters(search_terms, pool, is_a_set, results, max_results)
if len(results)>=max_results:
return results
return results
Expand All @@ -339,8 +330,8 @@ def get_matching_characters_from_list_of_pools(self, search_terms, pool_list, ma
kanji_scores = dict() # score for each kanji
kanji_matches = dict() # how many search terms were matched

for pool_priority, pool, is_set in pool_list:
self.get_matching_characters_with_scoring(search_terms, pool, is_set, pool_priority, kanji_scores, kanji_matches)
for pool_priority, pool, is_a_set in pool_list:
self.get_matching_characters_with_scoring(search_terms, pool, is_a_set, pool_priority, kanji_scores, kanji_matches)

# remove those kanjis that didn't match all the search terms
for kanji, matched_search_terms in kanji_matches.items():
Expand Down Expand Up @@ -389,7 +380,8 @@ def search(self, search_str, max_results=15):
search_terms_dict[term] = 1

# A list of search pools, each having a distinct priority
priority_list = [
priority_list = [
# [ priority, pool, the_pool_is_a_set ]
[30,self.keyword_set_cache,True],
[26,self.keyword_cache,False],
[20,self.rec_primitive_list_cache,False],
Expand Down

0 comments on commit 82fc11b

Please sign in to comment.