From 74bc5eafd190d8dc043646f34183fc243eb0085c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= <gael.de-chalendar@cea.fr>
Date: Thu, 23 May 2024 17:29:53 +0200
Subject: [PATCH 1/5] Correct 'with' statement for Python 3.7

---
 lima_linguisticdata/scripts/xmlforms.py | 114 ++++++++++++------------
 1 file changed, 56 insertions(+), 58 deletions(-)
diff --git a/lima_linguisticdata/scripts/xmlforms.py b/lima_linguisticdata/scripts/xmlforms.py
index a49a7d4a2..7a6bf7674 100644
--- a/lima_linguisticdata/scripts/xmlforms.py
+++ b/lima_linguisticdata/scripts/xmlforms.py
@@ -58,64 +58,62 @@ def main():
 
     args = parser.parse_args()
 
-    with (
-        codecs.open(args.inputfile, "r", "utf-8") as source,
-        codecs.open(args.outputfile, "a", "utf-8") as out
-    ):
-        form = ""
-        lemma = ""
-        norm = ""
-
-        count = 0
-        icount = 0
-
-        lines = source.readlines()
-        for line in tqdm(lines, desc="Processing lines", unit="line"):
-            line = line.strip()
-            line = (
-                line.replace("&", "&amp;")
-                .replace('"', "&quot;")
-                .replace("<", "&lt;")
-                .replace(">", "&gt;")
-            )
-            if line == "":
-                continue
-
-            line = line.split("#")[0].strip()
-            data = line.split("\t")
-            if line == "" or len(data) != 4:
-                print(f"xmlform: Invalid line '{line}'")
-                continue
-
-            if data[0] != form:
-                form = data[0]
-                if count > 0:
-                    out.write("  </i>\n</entry>\n")
-                out.write(f'<entry k="{form}"')
-                if args.desacc:
-                    out.write(f' desacc="{args.desacc}"')
-                if args.entryop:
-                    out.write(f' op="{args.entryop}"')
-                out.write(">\n")
-                icount = 0
-                count += 1
-
-            if icount == 0 or data[1] != lemma or data[2] != norm:
-                lemma = data[1]
-                norm = data[2]
-                if icount > 0:
-                    out.write("  </i>\n")
-                out.write("  <i")
-                if lemma:
-                    out.write(f' l="{lemma}"')
-                if norm:
-                    out.write(f' n="{norm}"')
-                if args.lingop:
-                    out.write(f' op="{args.lingop}"')
-                out.write(">\n")
-                icount += 1
-
-            out.write(f'    <p v="{data[3]}"/>\n')
+    with codecs.open(args.inputfile, "r", "utf-8") as source:
+        with codecs.open(args.outputfile, "a", "utf-8") as out:
+            form = ""
+            lemma = ""
+            norm = ""
+
+            count = 0
+            icount = 0
+
+            lines = source.readlines()
+            for line in tqdm(lines, desc="Processing lines", unit="line"):
+                line = line.strip()
+                line = (
+                    line.replace("&", "&amp;")
+                    .replace('"', "&quot;")
+                    .replace("<", "&lt;")
+                    .replace(">", "&gt;")
+                )
+                if line == "":
+                    continue
+
+                line = line.split("#")[0].strip()
+                data = line.split("\t")
+                if line == "" or len(data) != 4:
+                    print(f"xmlform: Invalid line '{line}'")
+                    continue
+
+                if data[0] != form:
+                    form = data[0]
+                    if count > 0:
+                        out.write("  </i>\n</entry>\n")
+                    out.write(f'<entry k="{form}"')
+                    if args.desacc:
+                        out.write(f' desacc="{args.desacc}"')
+                    if args.entryop:
+                        out.write(f' op="{args.entryop}"')
+                    out.write(">\n")
+                    icount = 0
+                    count += 1
+
+                if icount == 0 or data[1] != lemma or data[2] != norm:
+                    lemma = data[1]
+                    norm = data[2]
+                    if icount > 0:
+                        out.write("  </i>\n")
+                    out.write("  <i")
+                    if lemma:
+                        out.write(f' l="{lemma}"')
+                    if norm:
+                        out.write(f' n="{norm}"')
+                    if args.lingop:
+                        out.write(f' op="{args.lingop}"')
+                    out.write(">\n")
+                    icount += 1
+
+                out.write(f'    <p v="{data[3]}"/>\n')
 
         if count > 0:
             out.write("  </i>\n</entry>\n")

From 21c087a67c51be2a2978f1c2034f2fe9b7ce9ca7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= <gael.de-chalendar@cea.fr>
Date: Thu, 23 May 2024 20:24:09 +0200
Subject: [PATCH 2/5] Correct writing on closed file

---
 lima_linguisticdata/scripts/xmlforms.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lima_linguisticdata/scripts/xmlforms.py b/lima_linguisticdata/scripts/xmlforms.py
index 7a6bf7674..03c56fe4e 100644
--- a/lima_linguisticdata/scripts/xmlforms.py
+++ b/lima_linguisticdata/scripts/xmlforms.py
@@ -115,8 +115,8 @@ def main():
 
                 out.write(f'    <p v="{data[3]}"/>\n')
 
-        if count > 0:
-            out.write("  </i>\n</entry>\n")
+            if count > 0:
+                out.write("  </i>\n</entry>\n")
 
 
 if __name__ == "__main__":

From daee28b48ef6318226d00c72cda6759642845c2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= <gael.de-chalendar@cea.fr>
Date: Thu, 23 May 2024 20:33:54 +0200
Subject: [PATCH 3/5] Add missing return in operator=

---
 .../common/PropertyCode/PropertyAccessor.cpp                     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyAccessor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyAccessor.cpp
index ea5acddc0..26f73b74b 100644
--- a/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyAccessor.cpp
+++ b/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyAccessor.cpp
@@ -66,6 +66,7 @@ PropertyAccessorPrivate& PropertyAccessorPrivate::operator=(const PropertyAccess
   m_mask = pap.m_mask;
   m_emptyNessMask = pap.m_emptyNessMask;
   m_name = pap.m_name;
+  return *this;
 }
 
 

From 348c2376d9ffdf9ad1951a64d0e53ca61506eb61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= <gael.de-chalendar@cea.fr>
Date: Thu, 23 May 2024 23:03:26 +0200
Subject: [PATCH 4/5] Continue converting perl scripts to python

---
 .../cmake/LinguisticData.cmake                |   6 +-
 lima_linguisticdata/scripts/flex.pl           |  26 +-
 lima_linguisticdata/scripts/flex.py           | 237 ++++++++++++++++++
 .../scripts/pointvirgules2tabs.py             |  32 +++
 4 files changed, 280 insertions(+), 21 deletions(-)
 create mode 100644 lima_linguisticdata/scripts/flex.py
 create mode 100644 lima_linguisticdata/scripts/pointvirgules2tabs.py

diff --git a/lima_linguisticdata/cmake/LinguisticData.cmake b/lima_linguisticdata/cmake/LinguisticData.cmake
index 4cda79d1b..b6d90cbfc 100644
--- a/lima_linguisticdata/cmake/LinguisticData.cmake
+++ b/lima_linguisticdata/cmake/LinguisticData.cmake
@@ -73,7 +73,7 @@ endmacro (CODES _lang)
 macro (FLEXION _lang)
   add_custom_command(
     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/formes-${_lang}.txt
-    COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/flex.pl def.txt  mots-simples.txt ${CMAKE_CURRENT_BINARY_DIR} formes-${_lang}.txt exclude.txt
+    COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/flex.py def.txt mots-simples.txt ${CMAKE_CURRENT_BINARY_DIR} formes-${_lang}.txt --excludesfile exclude.txt
     DEPENDS def.txt  mots-simples.txt exclude.txt
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
     VERBATIM
@@ -96,9 +96,9 @@ macro(CONVERT _lang)
 
   add_custom_command(
     OUTPUT dicotabs.txt
-    COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.pl ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt
+    COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt
-    COMMENT "perl ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.pl ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt"
+    COMMENT "python3 ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt"
     VERBATIM
   )
   add_custom_target(
diff --git a/lima_linguisticdata/scripts/flex.pl b/lima_linguisticdata/scripts/flex.pl
index a542180bb..35d47a230 100755
--- a/lima_linguisticdata/scripts/flex.pl
+++ b/lima_linguisticdata/scripts/flex.pl
@@ -108,30 +108,20 @@ sub traitement
 
     while($allDone==0) {
       $debug && print "subleminv='$subleminv'\n";
-    if (defined ($$models{$subleminv})) 
+      if (defined ($$models{$subleminv}))
       {
         $model = $$models{$subleminv};
         $commun = $subleminv;
         last;
       }
-      if(length($subleminv)>0) {
+      if(length($subleminv)>0)
+      {
             chop ($subleminv);
-	}
+      }
       else { $allDone=1; }
-  }
-
-#    foreach my $masque (@$modelstab)
-#    {
-#      $debug && print "masque='$masque'\n";
-#      if ($leminv=~/^$masque/)
-#      {
-#        $model = $$models{$masque};
-#        $commun = $masque;
-#        last;
-#      }
-#    }
+    }
 
-    if (!defined($model)) 
+    if (!defined($model))
     {
       print FILETRACE "Unable to handle line (no model found) $infileLineNum: $line\n";
       return;
@@ -247,12 +237,12 @@ sub loadData
       }
       push @{$$categsTypes{$categ}}, $type;
     }
-    my (@modelentries, %models, @modelstab);
+    my (%models, @modelstab);
 
     my $nbmodel = loadModels($ficmodel, \%models, \@modelstab);
     print "Got $nbmodel $type modelword\n";
 
-    my (@models,@third,@fourth,%direct,%modelsdata);
+    my (%modelsdata);
 
     my $nbtable = loadTable($type, $fictable, \%modelsdata);
     print "Got $nbtable $type table elems\n";
diff --git a/lima_linguisticdata/scripts/flex.py b/lima_linguisticdata/scripts/flex.py
new file mode 100644
index 000000000..b178be6de
--- /dev/null
+++ b/lima_linguisticdata/scripts/flex.py
@@ -0,0 +1,237 @@
+import argparse
+import codecs
+import os
+import sys
+
+from tqdm import tqdm
+
+output_file, filetrace = None, None  # opened by main()
+excludes = {}  # lines that print_output() must not write to the output
+
+
+def load_data(file, categs_types, struct):
+    """Read the definition file and load each type's models and table.
+
+    Fills categs_types (category -> list of types) and struct
+    (type -> models, modelsdata, modelstab); exits on a malformed line.
+    """
+    with codecs.open(file, 'r', 'utf-8') as deffile:
+        for line_num, raw in enumerate(deffile, 1):
+            line = raw.strip()
+            if not line or line.startswith('#'):
+                continue
+            fields = line.split(';')
+            if len(fields) != 4:
+                sys.exit(f"Malformed definition line {line_num} "
+                         f"in {file}: {line}")
+            type_, categs, ficmodel, fictable = fields
+
+            for categ in categs.split(','):
+                categs_types.setdefault(categ, []).append(type_)
+
+            models, modelstab, modelsdata = {}, [], {}
+            nbmodel = load_models(ficmodel, models, modelstab)
+            print(f"Got {nbmodel} {type_} modelword")
+            nbtable = load_table(type_, fictable, modelsdata)
+            print(f"Got {nbtable} {type_} table elems")
+
+            struct[type_] = {"models": models,
+                             "modelsdata": modelsdata,
+                             "modelstab": modelstab}
+
+
+def load_models(dicfile, models, modelstab):
+    """Fill models and modelstab from dicfile; return the last line number."""
+    print(f"Loading {dicfile}")
+    line_num = 0  # stays 0 (instead of being unbound) on an empty file
+    with codecs.open(dicfile, 'r', 'utf-8') as filedic:
+        for line_num, line in enumerate(filedic, 1):
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            if line.count(';') != 2:
+                sys.exit(f"Malformed model line {line_num} "
+                         f"in {dicfile}: {line}")
+            model, val, _ = line.split(';')
+            modelstab.append(model)
+            models[model] = val
+    return line_num
+
+
+def load_table(type_, tablefile, modelsdata):
+    """Load the table file into modelsdata (model word -> list of entries).
+
+    Malformed lines are reported on stderr and make the script exit once
+    the whole file has been read. Returns the last line number read.
+    """
+    print(f"Loading {tablefile}")
+    oldmodel, maps, ok = "", [], True
+    line_num = 0  # stays 0 (instead of being unbound) on an empty file
+    with codecs.open(tablefile, 'r', 'utf-8') as filetable:
+        for line_num, line in enumerate(filetable, 1):
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            parts = line.split(';')
+            if not 3 <= len(parts) <= 4:
+                print(f"Malformed table line {line_num} "
+                      f"in {tablefile}: {line}", file=sys.stderr)
+                ok = False
+                continue
+            if type_ != "verbe" and len(parts) == 4:
+                print(f"Malformed non-verb table line {line_num} "
+                      f"in {tablefile} (should have 3 columns):\n{line}",
+                      file=sys.stderr)
+                ok = False
+                continue
+            modelword, formes, thirds = parts[:3]
+            fourths = parts[3] if len(parts) == 4 else ""
+            if oldmodel and oldmodel != modelword:
+                modelsdata[oldmodel] = maps[:]
+                maps.clear()
+            oldmodel = modelword
+            maps.append({"forme": formes, "third": thirds, "fourth": fourths})
+        if oldmodel:
+            modelsdata[oldmodel] = maps[:]
+    if not ok:
+        sys.exit(f"Had problems reading {tablefile}")
+    return line_num
+
+
+def load_excludes(excludesfile, excludes):
+    """Record each stripped line of excludesfile as a key of excludes."""
+    print(f"Loading exclusions file {excludesfile}")
+    if not os.path.isfile(excludesfile):
+        print(f"File {excludesfile} not found", file=sys.stderr)
+        return
+    with codecs.open(excludesfile, 'r', 'utf-8') as excludefile:
+        for entry in excludefile:
+            excludes[entry.strip()] = True
+
+
+def racine(type_, model, lem, commun):
+    """Return the root of lem and the root length to skip in model forms."""
+    if type_ == "verbe":
+        # Drop the ending shared by the lemma and its model word, comparing
+        # them character by character from the end.
+        llem, lmodel = len(lem) - 1, len(model) - 1
+        while llem >= 0 and lmodel >= 0 and lem[llem] == model[lmodel]:
+            llem -= 1
+            lmodel -= 1
+        return lem[:llem + 1], lmodel + 1
+
+    # A trailing '$' in commun is not counted in the length of the ending.
+    lfin = len(commun) - (1 if commun.endswith('$') else 0)
+    longueur = len(model) - lfin - (1 if '^' in model else 0)
+    return lem[:len(lem) - lfin], longueur
+
+
+def traitement(struct, line, types, lem, leminv, categ, norm, filetrace):
+    """Write every inflected form of lem for each of the given types.
+
+    The model is looked up with the longest prefix of leminv + '$' found in
+    the models of the type. Each form is the root of lem followed by the
+    ending of a model form; '/' separates two alternative model forms.
+    When types is None, lem is written as is. Lookup failures are reported
+    in filetrace and stop the processing of lem.
+    """
+    if types is None:
+        print_output(f"{lem};{categ};;;{lem};{norm};")
+        return
+
+    for type_ in types:
+        models = struct[type_]['models']
+        modelsdata = struct[type_]['modelsdata']
+        subleminv = leminv + '$'
+        model, commun = None, None
+
+        while subleminv:
+            if subleminv in models:
+                model, commun = models[subleminv], subleminv
+                break
+            subleminv = subleminv[:-1]
+
+        if model is None:
+            filetrace.write(f"Unable to handle line (no model found) {line}\n")
+            return
+        if model not in modelsdata:
+            filetrace.write(f"Unable to handle line "
+                            f"(no data for model {model}) {line}\n")
+            return
+
+        racine_val, longueur = racine(type_, model, lem, commun)
+        is_verb = type_ == 'verbe'
+        for map_ in modelsdata[model]:
+            third = map_['third'] if is_verb else map_['fourth']
+            fourth = map_['fourth'] if is_verb else map_['third']
+
+            # Without '/', split() yields the single form; with it, only the
+            # first two alternatives are used.
+            for forme in map_['forme'].split('/')[:2]:
+                # Ending = model form without its first longueur characters.
+                forme_sortie = racine_val + forme[longueur:]
+                outline = (f"{forme_sortie};{categ};{third}"
+                           f";{fourth};{lem};{norm};")
+                print_output(outline.replace('?', ''))
+
+
+def print_output(line):
+    """Write line to the output file unless it is excluded (then trace it)."""
+    if line in excludes:
+        filetrace.write(f"Excluding line: {line}\n")
+        return
+    output_file.write(f"{line}\n")
+
+
+def main():
+    """Inflect the lemmas of infile into workdir/outfile.
+
+    Rejected and excluded lines are traced in workdir/<infile>.log.
+    """
+    parser = argparse.ArgumentParser(description="Process some files.")
+    parser.add_argument('deffile', help="The definition file")
+    parser.add_argument('infile', help="The input file")
+    parser.add_argument('workdir', help="The working directory")
+    parser.add_argument('outfile', help="The output file")
+    parser.add_argument('--excludesfile', help="The excludes file")
+    args = parser.parse_args()
+
+    # print_output() writes through these module-level handles.
+    global output_file, filetrace, excludes
+    out_path = os.path.join(args.workdir, args.outfile)
+    log_path = os.path.join(args.workdir, f"{args.infile}.log")
+    # The with statement closes both files even if loading or processing
+    # fails (the loaders may call sys.exit()).
+    with codecs.open(out_path, 'w', 'utf-8') as output_file, \
+            codecs.open(log_path, 'w', 'utf-8') as filetrace:
+        categs_types, struct = {}, {}
+        load_data(args.deffile, categs_types, struct)
+
+        if args.excludesfile:
+            load_excludes(args.excludesfile, excludes)
+        else:
+            print("no excludes file", file=sys.stderr)
+
+        print("Processing simple words file")
+        with codecs.open(args.infile, 'r', 'utf-8') as infile:
+            for line_num, line in enumerate(
+              tqdm(infile, desc="Flexing lemmas"), 1):
+                line = line.strip()
+                if not line or line.startswith('#'):
+                    continue
+                parts = line.split(';')
+                if len(parts) < 5:
+                    filetrace.write(f"Malformed line {line_num}: {line}\n")
+                    continue
+
+                lem, categ, norm = parts[2], parts[3], parts[4]
+                if sum(1 for char in lem if char.isupper()) >= 2:
+                    # Lemmas with two or more capitals are not inflected.
+                    print_output(f"{lem};{categ};;;{lem};{norm};")
+                else:
+                    traitement(struct, line, categs_types.get(categ), lem,
+                               lem[::-1], categ, norm, filetrace)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lima_linguisticdata/scripts/pointvirgules2tabs.py b/lima_linguisticdata/scripts/pointvirgules2tabs.py
new file mode 100644
index 000000000..5b8442d16
--- /dev/null
+++ b/lima_linguisticdata/scripts/pointvirgules2tabs.py
@@ -0,0 +1,32 @@
+# Copyright 2002-2013 CEA LIST
+# SPDX-FileCopyrightText: 2022 CEA LIST <gael.de-chalendar@cea.fr>
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+
+
+def replace_semicolons_with_tabs(fin, fout):
+    """Copy the file fin to fout, turning every ';' into a tabulation.
+
+    Both files are handled as UTF-8 text; fin is read in one go.
+    """
+    with open(fin, 'r', encoding='utf-8') as source:
+        converted = source.read().replace(';', '\t')
+    with open(fout, 'w', encoding='utf-8') as target:
+        target.write(converted)
+
+
+def main():
+    """Parse the input and output paths and convert the file."""
+    description = "Replace all semicolons with tabulations in a file."
+    cli = argparse.ArgumentParser(description=description)
+    cli.add_argument('fin', help="The input file")
+    cli.add_argument('fout', help="The output file")
+    options = cli.parse_args()
+
+    replace_semicolons_with_tabs(options.fin, options.fout)
+
+
+if __name__ == "__main__":
+    main()

From 03edf6741e99ae12c5275af02d771cc45fb795fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= <gael.de-chalendar@cea.fr>
Date: Fri, 24 May 2024 08:19:42 +0200
Subject: [PATCH 5/5] Rename a cmake module to match the find command

---
 CMakeLists.txt                                                 | 2 +-
 cmake/Modules/{FindQHttpServer.cmake => FindQHTTPSERVER.cmake} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cmake/Modules/{FindQHttpServer.cmake => FindQHTTPSERVER.cmake} (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0ebe6725..f33470964 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -379,7 +379,7 @@ else (TRECPP_FOUND)
 endif (TRE_FOUND)
 
 # QHttpServer is necessary for limaserver HTTP server
-find_package(QHttpServer QUIET)
+find_package(QHTTPSERVER QUIET)
 if (NOT QHTTPSERVER_FOUND)
   message(STATUS "QHttpServer Not found. Lima HTTP server will NOT be built")
 else ()
diff --git a/cmake/Modules/FindQHttpServer.cmake b/cmake/Modules/FindQHTTPSERVER.cmake
similarity index 100%
rename from cmake/Modules/FindQHttpServer.cmake
rename to cmake/Modules/FindQHTTPSERVER.cmake