From f1eb4e208fd127b9c80cc6d13a8a48d10f2c64fb Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 6 Jun 2023 20:32:36 +0300 Subject: [PATCH 01/10] Fixes --- lingvodoc/schema/query.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index ce1a2f6f..8c6bb3a7 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13035,13 +13035,17 @@ def export_dataframe(result_pool, distance_data_array, bundles): 'translation': translation_lex ''' - groups = pd.DataFrame() - singles = pd.DataFrame() distances = pd.DataFrame(distance_data_array, columns=[perspective['name'] for perspective in result_pool.values()]) # Start index for distances from 1 to match with dictionaries numbers distances.index += 1 + groups = pd.DataFrame() + # Insert 'lines' column as the first one + groups['lines'] = 0 + + singles = pd.DataFrame() + row_index = 0 # re-group by group number and add joined values for perspective in result_pool.values(): @@ -13053,7 +13057,7 @@ def export_dataframe(result_pool, distance_data_array, bundles): continue group_num = entry['group'] entry_text = f"{entry['swadesh']} [ {entry['transcription']} ] {entry['translation']}" - if group_num and group_num in bundles: + if group_num is not None and group_num in bundles: # Concatinate existing value if is and a new one, store the result to 'groups' dataframe value = "" if dict_name in groups: @@ -13080,7 +13084,6 @@ def export_dataframe(result_pool, distance_data_array, bundles): @staticmethod def export_xlsx( result, - columns, base_language_name, storage ): @@ -13112,6 +13115,8 @@ def export_xlsx( for sheet_name, df in result.items(): index = (sheet_name == 'Distances') startcol = int(index) + # Exclude 'lines' column + columns = df.columns[int(sheet_name == 'Cognates'):] df.to_excel(writer, sheet_name=sheet_name, @@ -13364,7 +13369,7 @@ def split_lex(lex): # GC del result_pool - xlsx_url = SwadeshAnalysis.export_xlsx(result, distance_header_array, base_language_name, storage) + xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) # 'lines' field is not needed any more del result['Cognates']['lines'] From 8d25a943acc3c3d4122d7440aa7a690c0254e6de Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 6 Jun 2023 20:53:02 +0300 Subject: [PATCH 02/10] Sorting --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 8c6bb3a7..4fd7c47b 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13076,7 +13076,7 @@ def export_dataframe(result_pool, distance_data_array, bundles): row_index += 1 return { - 'Cognates': groups if groups.empty else groups.sort_values(groups.columns[0]), + 'Cognates': groups if len(groups) < 2 else groups.sort_values(groups.columns[1]), 'Singles': singles.sort_index(), 'Distances': distances.sort_index() } From 2621681799bf2a1cfb0747cb78279bdc67412b26 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 7 Jun 2023 16:55:47 +0300 Subject: [PATCH 03/10] Multi-language --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 4fd7c47b..d410b263 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13355,7 +13355,7 @@ def split_lex(lex): means_total = len(swadesh_total[perspective1] & swadesh_total[perspective2]) - if n2 > n1 and len(means_common) > 0: + if n2 > n1 and means_linked >= means_total: log.debug(f"{n1+1},{n2+1} : " f"{len(means_common)} but {means_linked} of {means_total} : " f"{', '.join(sorted(means_common))}") From 8eeb6cf91a2155fcf1bdc7723f726a828090f680 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 7 Jun 2023 17:20:44 +0300 Subject: [PATCH 04/10] Check if table is empty --- lingvodoc/schema/query.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index d410b263..03c8f137 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13117,6 +13117,9 @@ def export_xlsx( startcol = int(index) # Exclude 'lines' column columns = df.columns[int(sheet_name == 'Cognates'):] + # Check if the table is empty + if columns.empty: + continue df.to_excel(writer, sheet_name=sheet_name, From c55900da6c7dabca8e6645f3d9c1fb2b4c1a7066 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 20 Jun 2023 17:21:55 +0300 Subject: [PATCH 05/10] Table with borrowed words --- lingvodoc/schema/query.py | 67 +++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 03c8f137..eb2565a9 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13044,9 +13044,11 @@ def export_dataframe(result_pool, distance_data_array, bundles): # Insert 'lines' column as the first one groups['lines'] = 0 + borrowed = pd.DataFrame() singles = pd.DataFrame() - row_index = 0 + singles_index = 0 + borrowed_index = 0 # re-group by group number and add joined values for perspective in result_pool.values(): dict_name = perspective['name'] @@ -13071,13 +13073,17 @@ def export_dataframe(result_pool, distance_data_array, bundles): cell = groups.loc[group_num].get('lines') if pd.isnull(cell) or cell < lines: groups.loc[group_num, 'lines'] = lines + elif entry['borrowed']: + borrowed.loc[borrowed_index, dict_name] = entry_text + borrowed_index += 1 else: - singles.loc[row_index, dict_name] = entry_text - row_index += 1 + singles.loc[singles_index, dict_name] = entry_text + singles_index += 1 return { 'Cognates': groups if len(groups) < 2 else groups.sort_values(groups.columns[1]), 'Singles': singles.sort_index(), + 'Borrowed': borrowed.sort_index(), 'Distances': distances.sort_index() } @@ -13146,6 +13152,39 @@ def export_xlsx( return xlsx_url + @staticmethod + def export_html(result, tiny_dicts=None, huge_size=1048576): + result_tables = ( + build_table(result['Distances'], 'orange_light', width="300px", index=True), + build_table(result['Cognates'], 'blue_light', width="300px").replace("\\n","
"), + build_table(result['Singles'], 'green_light', width="300px"), + build_table(result['Borrowed'], 'yellow_light', width="300px")) + + # Control output size + spl = "
\n\n
" + html_result = f"{result_tables[0]}" \ + f"{spl}" \ + f"{result_tables[1]}" \ + f"{spl}" \ + f"{result_tables[2]}" \ + f"{spl}" \ + f"{result_tables[3]}" + + if len(html_result) > huge_size: + html_result = f"{result_tables[0]}" \ + f"{spl}" \ + f"{result_tables[1]}" \ + f"
\n\nNote: The table with single words is not shown due to huge summary size
" + + if len(html_result) > huge_size: + html_result = f"{result_tables[0]}" \ + f"
\n\nNote: The result tables with words are not shown due to huge summary size
" + + html_result += ("
Note: The following dictionaries contain too less words and were not processed: \n\n" +
+                        '\n'.join(tiny_dicts) + "
") if tiny_dicts else "" + + return html_result + @staticmethod def swadesh_statistics( language_str, @@ -13377,25 +13416,7 @@ def split_lex(lex): # 'lines' field is not needed any more del result['Cognates']['lines'] - result_tables = ( - build_table(result['Distances'], 'orange_light', width="300px", index=True), - build_table(result['Cognates'], 'blue_light', width="300px").replace("\\n","
"), - build_table(result['Singles'], 'green_light', width="300px")) - - # Control output size - huge_size = 1048576 - result = f"{result_tables[0]}
\n\n
{result_tables[1]}
\n\n
{result_tables[2]}" - if len(result) > huge_size: - result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ - f"
\n\nNote: The table with single words is not shown due to huge summary size
" - if len(result) > huge_size: - result = f"{result_tables[0]}" \ - f"
\n\nNote: The result tables with words are not shown due to huge summary size
" - result += ("
Note: The following dictionaries contain too less words and were not processed: \n\n" +
-                   '\n'.join(tiny_dicts) + "
") if tiny_dicts else "" - - # GC - del result_tables + html_result = SwadeshAnalysis.export_html(result, tiny_dicts) _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( @@ -13413,7 +13434,7 @@ def split_lex(lex): dict( triumph = True, - result = result, + result = html_result, xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, From d38c07682bac830682ec53adb0867ca50d53c0f2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 22 Jun 2023 11:47:02 +0300 Subject: [PATCH 06/10] Excluded exclusions --- lingvodoc/schema/query.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index eb2565a9..82233d7f 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13209,6 +13209,8 @@ def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): # Split by commas and open brackets to separate # various forms of lexeme and extra note if is + if "убрать из стословника" in lex.lower(): + return set() return set(f" {form}".lower().replace(" заим.", "").strip() for form in lex.replace('(', ',').split(',') if form.strip() From a67067f5ba898b856b76480f8b08bba20e2d46b9 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 22 Jun 2023 12:13:49 +0300 Subject: [PATCH 07/10] Reduce multi spaces --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 82233d7f..63834b0f 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13209,7 +13209,7 @@ def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): # Split by commas and open brackets to separate # various forms of lexeme and extra note if is - if "убрать из стословника" in lex.lower(): + if "убрать из стословника" in ' '.join(lex.lower().split()): return set() return set(f" {form}".lower().replace(" заим.", "").strip() for form in lex.replace('(', ',').split(',') From cb01029cc966f82b0da1a63c94e57018c2b68ed9 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 22 Jun 2023 12:31:31 +0300 Subject: [PATCH 08/10] Refactoring --- lingvodoc/schema/query.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 63834b0f..4dee5ee5 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13209,15 +13209,18 @@ def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): # Split by commas and open brackets to separate # various forms of lexeme and extra note if is - if "убрать из стословника" in ' '.join(lex.lower().split()): + lex = ' '.join(lex.lower().split()) # reduce multi spaces + if "убрать из стословника" in lex: return set() - return set(f" {form}".lower().replace(" заим.", "").strip() + + return set(form.strip() for form in lex.replace('(', ',').split(',') - if form.strip() - and ')' not in form) # exclude notes + if form.strip() and ')' not in form) # exclude notes + # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) + _, group_list, _ = ( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) From 609984b8349594845d1b372eea1630bb2af42ced Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 5 Jul 2023 19:02:14 +0300 Subject: [PATCH 09/10] Percent --- lingvodoc/schema/query.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 89dc5ace..11fe0b6b 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13448,6 +13448,7 @@ def split_lex(lex): dictionary_count = len(means) distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float') + complex_data_array = numpy.full((dictionary_count, dictionary_count), "n/a", dtype='object') distance_header_array = numpy.full(dictionary_count, "", dtype='object') # Calculate intersection between lists of linked means (Swadesh matching) @@ -13462,6 +13463,7 @@ def split_lex(lex): for n2, (perspective2, means2) in enumerate(means.items()): if n1 == n2: distance_data_array[n1][n2] = 0 + complex_data_array[n1][n2] = "n/a" else: # Common means of entries which have etimological linkes # but this linkes may be not mutual @@ -13484,9 +13486,11 @@ def split_lex(lex): # means_linked > 0 means that means_total > 0 even more so distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50 + percent = means_linked * 100 // means_total if means_total > 0 else 0 distance_data_array[n1][n2] = round(distance, 2) + complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]} ({percent}%)" - result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles) + result = SwadeshAnalysis.export_dataframe(result_pool, complex_data_array, bundles) # GC del result_pool From 1d7288aab54611c8f7cfb9562053154f4fe817a4 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 5 Jul 2023 19:24:00 +0300 Subject: [PATCH 10/10] Print strictly two decimal digits --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 11fe0b6b..31f19c74 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13488,7 +13488,7 @@ def split_lex(lex): distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50 percent = means_linked * 100 // means_total if means_total > 0 else 0 distance_data_array[n1][n2] = round(distance, 2) - complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]} ({percent}%)" + complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)" result = SwadeshAnalysis.export_dataframe(result_pool, complex_data_array, bundles)