From f1eb4e208fd127b9c80cc6d13a8a48d10f2c64fb Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Tue, 6 Jun 2023 20:32:36 +0300
Subject: [PATCH 01/10] Fixes

---
 lingvodoc/schema/query.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index ce1a2f6f..8c6bb3a7 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13035,13 +13035,17 @@ def export_dataframe(result_pool, distance_data_array, bundles):
         'translation': translation_lex
         '''
 
-        groups = pd.DataFrame()
-        singles = pd.DataFrame()
         distances = pd.DataFrame(distance_data_array,
                                  columns=[perspective['name'] for perspective in result_pool.values()])
         # Start index for distances from 1 to match with dictionaries numbers
         distances.index += 1
 
+        groups = pd.DataFrame()
+        # Insert 'lines' column as the first one
+        groups['lines'] = 0
+
+        singles = pd.DataFrame()
+
         row_index = 0
         # re-group by group number and add joined values
         for perspective in result_pool.values():
@@ -13053,7 +13057,7 @@ def export_dataframe(result_pool, distance_data_array, bundles):
                     continue
                 group_num = entry['group']
                 entry_text = f"{entry['swadesh']} [ {entry['transcription']} ] {entry['translation']}"
-                if group_num and group_num in bundles:
+                if group_num is not None and group_num in bundles:
                     # Concatinate existing value if is and a new one, store the result to 'groups' dataframe
                     value = ""
                     if dict_name in groups:
@@ -13080,7 +13084,6 @@ def export_dataframe(result_pool, distance_data_array, bundles):
     @staticmethod
     def export_xlsx(
             result,
-            columns,
             base_language_name,
             storage
     ):
@@ -13112,6 +13115,8 @@ def export_xlsx(
             for sheet_name, df in result.items():
                 index = (sheet_name == 'Distances')
                 startcol = int(index)
+                # Exclude 'lines' column
+                columns = df.columns[int(sheet_name == 'Cognates'):]
 
                 df.to_excel(writer,
                             sheet_name=sheet_name,
@@ -13364,7 +13369,7 @@ def split_lex(lex):
         # GC
         del result_pool
 
-        xlsx_url = SwadeshAnalysis.export_xlsx(result, distance_header_array, base_language_name, storage)
+        xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage)
 
         # 'lines' field is not needed any more
         del result['Cognates']['lines']

From 8d25a943acc3c3d4122d7440aa7a690c0254e6de Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Tue, 6 Jun 2023 20:53:02 +0300
Subject: [PATCH 02/10] Sorting

---
 lingvodoc/schema/query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 8c6bb3a7..4fd7c47b 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13076,7 +13076,7 @@ def export_dataframe(result_pool, distance_data_array, bundles):
                     row_index += 1
 
         return {
-            'Cognates': groups if groups.empty else groups.sort_values(groups.columns[0]),
+            'Cognates': groups if len(groups) < 2 else groups.sort_values(groups.columns[1]),
             'Singles': singles.sort_index(),
             'Distances': distances.sort_index()
         }

From 2621681799bf2a1cfb0747cb78279bdc67412b26 Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Wed, 7 Jun 2023 16:55:47 +0300
Subject: [PATCH 03/10] Multi-language

---
 lingvodoc/schema/query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 4fd7c47b..d410b263 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13355,7 +13355,7 @@ def split_lex(lex):
 
                     means_total = len(swadesh_total[perspective1] & swadesh_total[perspective2])
 
-                    if n2 > n1 and len(means_common) > 0:
+                    if n2 > n1 and means_linked >= means_total:
                         log.debug(f"{n1+1},{n2+1} : "
                                   f"{len(means_common)} but {means_linked} of {means_total} : "
                                   f"{', '.join(sorted(means_common))}")

From 8eeb6cf91a2155fcf1bdc7723f726a828090f680 Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Wed, 7 Jun 2023 17:20:44 +0300
Subject: [PATCH 04/10] Check if table is empty

---
 lingvodoc/schema/query.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index d410b263..03c8f137 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13117,6 +13117,9 @@ def export_xlsx(
                 startcol = int(index)
                 # Exclude 'lines' column
                 columns = df.columns[int(sheet_name == 'Cognates'):]
+                # Check if the table is empty
+                if columns.empty:
+                    continue
 
                 df.to_excel(writer,
                             sheet_name=sheet_name,

From c55900da6c7dabca8e6645f3d9c1fb2b4c1a7066 Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Tue, 20 Jun 2023 17:21:55 +0300
Subject: [PATCH 05/10] Table with borrowed words

---
 lingvodoc/schema/query.py | 67 +++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 23 deletions(-)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 03c8f137..eb2565a9 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13044,9 +13044,11 @@ def export_dataframe(result_pool, distance_data_array, bundles):
         # Insert 'lines' column as the first one
         groups['lines'] = 0
 
+        borrowed = pd.DataFrame()
         singles = pd.DataFrame()
 
-        row_index = 0
+        singles_index = 0
+        borrowed_index = 0
         # re-group by group number and add joined values
         for perspective in result_pool.values():
             dict_name = perspective['name']
@@ -13071,13 +13073,17 @@ def export_dataframe(result_pool, distance_data_array, bundles):
                     cell = groups.loc[group_num].get('lines')
                     if pd.isnull(cell) or cell < lines:
                         groups.loc[group_num, 'lines'] = lines
+                elif entry['borrowed']:
+                    borrowed.loc[borrowed_index, dict_name] = entry_text
+                    borrowed_index += 1
                 else:
-                    singles.loc[row_index, dict_name] = entry_text
-                    row_index += 1
+                    singles.loc[singles_index, dict_name] = entry_text
+                    singles_index += 1
 
         return {
             'Cognates': groups if len(groups) < 2 else groups.sort_values(groups.columns[1]),
             'Singles': singles.sort_index(),
+            'Borrowed': borrowed.sort_index(),
             'Distances': distances.sort_index()
         }
 
@@ -13146,6 +13152,39 @@ def export_xlsx(
 
         return xlsx_url
 
+    @staticmethod
+    def export_html(result, tiny_dicts=None, huge_size=1048576):
+        """Build the HTML summary of the result tables, shortened when it exceeds `huge_size`.
+
+        Tables come from `build_table` over result['Distances' | 'Cognates' | 'Singles' | 'Borrowed'].
+        If the text is longer than `huge_size` characters, 'Singles' and 'Borrowed' are replaced by
+        a note, then 'Cognates' as well. Names in `tiny_dicts`, if any, are listed in a final note.
+        """
+        result_tables = (
+            build_table(result['Distances'], 'orange_light', width="300px", index=True),
+            build_table(result['Cognates'], 'blue_light', width="300px").replace("\\n","<br>"),
+            build_table(result['Singles'], 'green_light', width="300px"),
+            build_table(result['Borrowed'], 'yellow_light', width="300px"))
+
+        # Control output size: start with all four tables, separated by `spl`
+        spl = "<pre>\n\n</pre>"
+        html_result = f"{result_tables[0]}{spl}{result_tables[1]}{spl}" \
+                      f"{result_tables[2]}{spl}{result_tables[3]}"
+
+        if len(html_result) > huge_size:
+            # 'Singles' and 'Borrowed' are dropped together here, so the note must name both
+            html_result = f"{result_tables[0]}{spl}{result_tables[1]}" \
+                          f"<pre>\n\nNote: The tables with single and borrowed words are not shown due to huge summary size</pre>"
+
+        if len(html_result) > huge_size:
+            html_result = f"{result_tables[0]}" \
+                          f"<pre>\n\nNote: The result tables with words are not shown due to huge summary size</pre>"
+
+        html_result += ("<pre>Note: The following dictionaries contain too less words and were not processed: \n\n" +
+                        '\n'.join(tiny_dicts) + "</pre>") if tiny_dicts else ""
+
+        return html_result
+
     @staticmethod
     def swadesh_statistics(
             language_str,
@@ -13377,25 +13416,7 @@ def split_lex(lex):
         # 'lines' field is not needed any more
         del result['Cognates']['lines']
 
-        result_tables = (
-            build_table(result['Distances'], 'orange_light', width="300px", index=True),
-            build_table(result['Cognates'], 'blue_light', width="300px").replace("\\n","<br>"),
-            build_table(result['Singles'], 'green_light', width="300px"))
-
-        # Control output size
-        huge_size = 1048576
-        result = f"{result_tables[0]}<pre>\n\n</pre>{result_tables[1]}<pre>\n\n</pre>{result_tables[2]}"
-        if len(result) > huge_size:
-            result = f"{result_tables[0]}<pre>\n\n</pre>{result_tables[1]}" \
-                     f"<pre>\n\nNote: The table with single words is not shown due to huge summary size</pre>"
-        if len(result) > huge_size:
-            result = f"{result_tables[0]}" \
-                     f"<pre>\n\nNote: The result tables with words are not shown due to huge summary size</pre>"
-        result += ("<pre>Note: The following dictionaries contain too less words and were not processed: \n\n" +
-                   '\n'.join(tiny_dicts) + "</pre>") if tiny_dicts else ""
-
-        # GC
-        del result_tables
+        html_result = SwadeshAnalysis.export_html(result, tiny_dicts)
 
         _, mst_list, embedding_2d_pca, embedding_3d_pca = \
             CognateAnalysis.distance_graph(
@@ -13413,7 +13434,7 @@ def split_lex(lex):
             dict(
                 triumph = True,
 
-                result = result,
+                result = html_result,
                 xlsx_url = xlsx_url,
                 minimum_spanning_tree = mst_list,
                 embedding_2d = embedding_2d_pca,

From d38c07682bac830682ec53adb0867ca50d53c0f2 Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Thu, 22 Jun 2023 11:47:02 +0300
Subject: [PATCH 06/10] Excluded exclusions

---
 lingvodoc/schema/query.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index eb2565a9..82233d7f 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13209,6 +13209,8 @@ def compare_translations(swadesh_lex, dictionary_lex):
             def split_lex(lex):
                 # Split by commas and open brackets to separate
                 # various forms of lexeme and extra note if is
+                if "убрать из стословника" in lex.lower():
+                    return set()
                 return set(f" {form}".lower().replace(" заим.", "").strip()
                            for form in lex.replace('(', ',').split(',')
                            if form.strip()

From a67067f5ba898b856b76480f8b08bba20e2d46b9 Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Thu, 22 Jun 2023 12:13:49 +0300
Subject: [PATCH 07/10] Reduce multi spaces

---
 lingvodoc/schema/query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 82233d7f..63834b0f 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13209,7 +13209,7 @@ def compare_translations(swadesh_lex, dictionary_lex):
             def split_lex(lex):
                 # Split by commas and open brackets to separate
                 # various forms of lexeme and extra note if is
-                if "убрать из стословника" in lex.lower():
+                if "убрать из стословника" in ' '.join(lex.lower().split()):
                     return set()
                 return set(f" {form}".lower().replace(" заим.", "").strip()
                            for form in lex.replace('(', ',').split(',')

From cb01029cc966f82b0da1a63c94e57018c2b68ed9 Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Thu, 22 Jun 2023 12:31:31 +0300
Subject: [PATCH 08/10] Refactoring

---
 lingvodoc/schema/query.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 63834b0f..4dee5ee5 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13209,15 +13209,18 @@ def compare_translations(swadesh_lex, dictionary_lex):
             def split_lex(lex):
                 # Split by commas and open brackets to separate
                 # various forms of lexeme and extra note if is
-                if "убрать из стословника" in ' '.join(lex.lower().split()):
+                lex = ' '.join(lex.lower().split()) # reduce multi spaces
+                if "убрать из стословника" in lex:
                     return set()
-                return set(f" {form}".lower().replace(" заим.", "").strip()
+
+                return set(form.strip()
                            for form in lex.replace('(', ',').split(',')
-                           if form.strip()
-                           and ')' not in form)  # exclude notes
+                           if form.strip() and ')' not in form)  # exclude notes
+
             # return true if the intersection is not empty
             return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex))
 
+
         _, group_list, _ = (
             CognateAnalysis.tag_data_plpgsql(
                 perspective_info_list, group_field_id))

From 609984b8349594845d1b372eea1630bb2af42ced Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Wed, 5 Jul 2023 19:02:14 +0300
Subject: [PATCH 09/10] Percent

---
 lingvodoc/schema/query.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 89dc5ace..11fe0b6b 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13448,6 +13448,7 @@ def split_lex(lex):
 
         dictionary_count = len(means)
         distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float')
+        complex_data_array = numpy.full((dictionary_count, dictionary_count), "n/a", dtype='object')
         distance_header_array = numpy.full(dictionary_count, "<noname>", dtype='object')
 
         # Calculate intersection between lists of linked means (Swadesh matching)
@@ -13462,6 +13463,7 @@ def split_lex(lex):
             for n2, (perspective2, means2) in enumerate(means.items()):
                 if n1 == n2:
                     distance_data_array[n1][n2] = 0
+                    complex_data_array[n1][n2] = "n/a"
                 else:
                     # Common means of entries which have etimological linkes
                     # but this linkes may be not mutual
@@ -13484,9 +13486,11 @@ def split_lex(lex):
 
                     # means_linked > 0 means that means_total > 0 even more so
                     distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50
+                    percent = means_linked * 100 // means_total if means_total > 0 else 0
                     distance_data_array[n1][n2] = round(distance, 2)
+                    complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]} ({percent}%)"
 
-        result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles)
+        result = SwadeshAnalysis.export_dataframe(result_pool, complex_data_array, bundles)
 
         # GC
         del result_pool

From 1d7288aab54611c8f7cfb9562053154f4fe817a4 Mon Sep 17 00:00:00 2001
From: Vladimir Monakhov <vmonakhov@ispras.ru>
Date: Wed, 5 Jul 2023 19:24:00 +0300
Subject: [PATCH 10/10] Print strictly two decimal digits

---
 lingvodoc/schema/query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py
index 11fe0b6b..31f19c74 100644
--- a/lingvodoc/schema/query.py
+++ b/lingvodoc/schema/query.py
@@ -13488,7 +13488,7 @@ def split_lex(lex):
                     distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50
                     percent = means_linked * 100 // means_total if means_total > 0 else 0
                     distance_data_array[n1][n2] = round(distance, 2)
-                    complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]} ({percent}%)"
+                    complex_data_array[n1][n2] = f"{distance_data_array[n1][n2]:.2f} ({percent}%)"
 
         result = SwadeshAnalysis.export_dataframe(result_pool, complex_data_array, bundles)