Changed migrations to better parse RW annotations

UAlbertaALTLab · Dec 10, 2024 · 32bab40
1 parent 69dcfa7
commit 32bab40
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 6 deletions.
diff --git a/src/morphodict/lexicon/migrations/0015_auto_20241128_2351.py b/src/morphodict/lexicon/migrations/0015_auto_20241128_2351.py
@@ -12,7 +12,6 @@ def migrate_semantic_domains(apps, schema_editor):
     # For every wordform, collect the semantic domain information in the old
     # format and place it where it belongs.
     wordforms = Wordform.objects.all()
-    count = wordforms.count()
     for wf in wordforms:
         if wf.rw_indices:
             rapidwords = [x.strip() for x in wf.rw_indices.split(";")]

diff --git a/src/morphodict/lexicon/migrations/0016_auto_20241202_1907.py b/src/morphodict/lexicon/migrations/0016_auto_20241202_1907.py
@@ -2,6 +2,7 @@
 
 from django.db import migrations
 from morphodict.search.types import WordnetEntry
+from django.db.models import Q
 
 
 def migrate_from_linguistinfo(apps, schema_editor):
@@ -12,7 +13,6 @@ def migrate_from_linguistinfo(apps, schema_editor):
     # For every wordform, collect the semantic domain information in the old
     # format and place it where it belongs.
     wordforms = Wordform.objects.all()
-    count = wordforms.count()
     for wf in wordforms:
         if not wf.linguist_info:
             continue
@@ -23,11 +23,35 @@ def migrate_from_linguistinfo(apps, schema_editor):
             for rw in rapidwords:
                 index = rw.strip()
                 try:
-                    wf.rapidwords.add(RapidWords.objects.get(index=index))
+                    rapidword = RapidWords.objects.get(index=index)
                 except RapidWords.DoesNotExist:
-                    print(
-                        f"ERROR: Slug {wf.slug} is annotated with nonexistent {index} RW index"
-                    )
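+                    # Exact index not found: fall back to the parent index (last dotted component dropped), then to the longest index that prefixes it among this entry's rw_domains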
+                    try:
+                        try:
+                            candidates = [
+                                RapidWords.objects.get(
+                                    index=".".join(index.split(".")[:-1])
+                                )
+                            ]
+                        except RapidWords.DoesNotExist:
+                            query = Q(domain__iexact=wf.linguist_info["rw_domains"][0])
+                            for domain in wf.linguist_info["rw_domains"][1:]:
+                                query |= Q(domain__iexact=domain)
+                            universe = RapidWords.objects.filter(query)
+                            candidates = [
+                                x for x in universe if index.startswith(x.index)
+                            ]
+                    except:
+                        candidates = []
+                    if len(candidates) > 0:
+                        candidates.sort(key=lambda x: len(x.index), reverse=True)
+                        rapidword = candidates[0]
+                    else:
+                        print(
+                            f"WARNING: ImportJSON error: Slug {wf.slug} is annotated with nonexistent {index} RW index"
+                        )
+                if rapidword:
+                    wf.rapidwords.add(rapidword)
 
         if "wn_domains" in wf.linguist_info:
             for wn in wf.linguist_info["wn_domains"]: