From 74bc5eafd190d8dc043646f34183fc243eb0085c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Thu, 23 May 2024 17:29:53 +0200 Subject: [PATCH 1/5] Correct with for python 3.7 --- lima_linguisticdata/scripts/xmlforms.py | 114 ++++++++++++------------ 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/lima_linguisticdata/scripts/xmlforms.py b/lima_linguisticdata/scripts/xmlforms.py index a49a7d4a2..7a6bf7674 100644 --- a/lima_linguisticdata/scripts/xmlforms.py +++ b/lima_linguisticdata/scripts/xmlforms.py @@ -58,64 +58,62 @@ def main(): args = parser.parse_args() - with ( - codecs.open(args.inputfile, "r", "utf-8") as source, - codecs.open(args.outputfile, "a", "utf-8") as out - ): - form = "" - lemma = "" - norm = "" - - count = 0 - icount = 0 - - lines = source.readlines() - for line in tqdm(lines, desc="Processing lines", unit="line"): - line = line.strip() - line = ( - line.replace("&", "&") - .replace('"', """) - .replace("<", "<") - .replace(">", ">") - ) - if line == "": - continue - - line = line.split("#")[0].strip() - data = line.split("\t") - if line == "" or len(data) != 4: - print(f"xmlform: Invalid line '{line}'") - continue - - if data[0] != form: - form = data[0] - if count > 0: - out.write(" \n\n") - out.write(f'\n") - icount = 0 - count += 1 - - if icount == 0 or data[1] != lemma or data[2] != norm: - lemma = data[1] - norm = data[2] - if icount > 0: - out.write(" \n") - out.write(" \n") - icount += 1 - - out.write(f'

\n') + with codecs.open(args.inputfile, "r", "utf-8") as source: + with codecs.open(args.outputfile, "a", "utf-8") as out: + form = "" + lemma = "" + norm = "" + + count = 0 + icount = 0 + + lines = source.readlines() + for line in tqdm(lines, desc="Processing lines", unit="line"): + line = line.strip() + line = ( + line.replace("&", "&") + .replace('"', """) + .replace("<", "<") + .replace(">", ">") + ) + if line == "": + continue + + line = line.split("#")[0].strip() + data = line.split("\t") + if line == "" or len(data) != 4: + print(f"xmlform: Invalid line '{line}'") + continue + + if data[0] != form: + form = data[0] + if count > 0: + out.write(" \n\n") + out.write(f'\n") + icount = 0 + count += 1 + + if icount == 0 or data[1] != lemma or data[2] != norm: + lemma = data[1] + norm = data[2] + if icount > 0: + out.write(" \n") + out.write(" \n") + icount += 1 + + out.write(f'

\n') if count > 0: out.write(" \n\n") From 21c087a67c51be2a2978f1c2034f2fe9b7ce9ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Thu, 23 May 2024 20:24:09 +0200 Subject: [PATCH 2/5] Correct writing on closed file --- lima_linguisticdata/scripts/xmlforms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lima_linguisticdata/scripts/xmlforms.py b/lima_linguisticdata/scripts/xmlforms.py index 7a6bf7674..03c56fe4e 100644 --- a/lima_linguisticdata/scripts/xmlforms.py +++ b/lima_linguisticdata/scripts/xmlforms.py @@ -115,8 +115,8 @@ def main(): out.write(f'

\n') - if count > 0: - out.write(" \n\n") + if count > 0: + out.write(" \n\n") if __name__ == "__main__": From daee28b48ef6318226d00c72cda6759642845c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Thu, 23 May 2024 20:33:54 +0200 Subject: [PATCH 3/5] Add missing return in operator= --- .../common/PropertyCode/PropertyAccessor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyAccessor.cpp b/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyAccessor.cpp index ea5acddc0..26f73b74b 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyAccessor.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/common/PropertyCode/PropertyAccessor.cpp @@ -66,6 +66,7 @@ PropertyAccessorPrivate& PropertyAccessorPrivate::operator=(const PropertyAccess m_mask = pap.m_mask; m_emptyNessMask = pap.m_emptyNessMask; m_name = pap.m_name; + return *this; } From 348c2376d9ffdf9ad1951a64d0e53ca61506eb61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Thu, 23 May 2024 23:03:26 +0200 Subject: [PATCH 4/5] Continue converting perl scripts to python --- .../cmake/LinguisticData.cmake | 6 +- lima_linguisticdata/scripts/flex.pl | 26 +- lima_linguisticdata/scripts/flex.py | 237 ++++++++++++++++++ .../scripts/pointvirgules2tabs.py | 32 +++ 4 files changed, 280 insertions(+), 21 deletions(-) create mode 100644 lima_linguisticdata/scripts/flex.py create mode 100644 lima_linguisticdata/scripts/pointvirgules2tabs.py diff --git a/lima_linguisticdata/cmake/LinguisticData.cmake b/lima_linguisticdata/cmake/LinguisticData.cmake index 4cda79d1b..b6d90cbfc 100644 --- a/lima_linguisticdata/cmake/LinguisticData.cmake +++ b/lima_linguisticdata/cmake/LinguisticData.cmake @@ -73,7 +73,7 @@ endmacro (CODES _lang) macro (FLEXION _lang) add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/formes-${_lang}.txt - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/flex.pl def.txt mots-simples.txt ${CMAKE_CURRENT_BINARY_DIR} formes-${_lang}.txt exclude.txt + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/flex.py def.txt mots-simples.txt ${CMAKE_CURRENT_BINARY_DIR} formes-${_lang}.txt --excludesfile exclude.txt DEPENDS def.txt mots-simples.txt exclude.txt WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} VERBATIM @@ -96,9 +96,9 @@ macro(CONVERT _lang) add_custom_command( OUTPUT dicotabs.txt - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.pl ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt - COMMENT "perl ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.pl ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt" + COMMENT "python3 ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt" VERBATIM ) add_custom_target( diff --git a/lima_linguisticdata/scripts/flex.pl b/lima_linguisticdata/scripts/flex.pl index a542180bb..35d47a230 100755 --- a/lima_linguisticdata/scripts/flex.pl +++ b/lima_linguisticdata/scripts/flex.pl @@ -108,30 +108,20 @@ sub traitement while($allDone==0) { $debug && print "subleminv='$subleminv'\n"; - if (defined ($$models{$subleminv})) + if (defined ($$models{$subleminv})) { $model = $$models{$subleminv}; $commun = $subleminv; last; } - if(length($subleminv)>0) { + if(length($subleminv)>0) + { chop ($subleminv); - } + } else { $allDone=1; } - } - -# foreach my $masque (@$modelstab) -# { -# $debug && print "masque='$masque'\n"; -# if ($leminv=~/^$masque/) -# { -# $model = $$models{$masque}; -# $commun = $masque; -# last; -# } -# } + } - if (!defined($model)) + if (!defined($model)) { print FILETRACE "Unable to handle line (no model found) $infileLineNum: $line\n"; return; @@ -247,12 +237,12 @@ sub loadData } push @{$$categsTypes{$categ}}, $type; } - my (@modelentries, %models, @modelstab); + my (%models, @modelstab); my $nbmodel = loadModels($ficmodel, \%models, \@modelstab); print "Got $nbmodel $type modelword\n"; - my (@models,@third,@fourth,%direct,%modelsdata); + my (%modelsdata); my $nbtable = loadTable($type, $fictable, \%modelsdata); print "Got $nbtable $type table elems\n"; diff --git a/lima_linguisticdata/scripts/flex.py b/lima_linguisticdata/scripts/flex.py new file mode 100644 index 000000000..b178be6de --- /dev/null +++ b/lima_linguisticdata/scripts/flex.py @@ -0,0 +1,237 @@ +import argparse +import codecs +import os +import sys + +from tqdm import tqdm + +output_file, filetrace = None, None +excludes = {} + + +def load_data(file, categs_types, struct): + with codecs.open(file, 'r', 'utf-8') as deffile: + for line_num, line in enumerate(deffile, 1): + line = line.strip() + if not line or line.startswith('#'): + continue + if not line.count(';') == 3: + sys.exit(f"Malformed definition line {line_num} " + f"in {file}: {line}") + + type_, categs, ficmodel, fictable = line.split(';') + categ_list = categs.split(',') + + for categ in categ_list: + if categ not in categs_types: + categs_types[categ] = [] + categs_types[categ].append(type_) + + models, modelstab = {}, [] + nbmodel = load_models(ficmodel, models, modelstab) + print(f"Got {nbmodel} {type_} modelword") + + modelsdata = {} + nbtable = load_table(type_, fictable, modelsdata) + print(f"Got {nbtable} {type_} table elems") + + struct[type_] = {"models": models, + "modelsdata": modelsdata, + "modelstab": modelstab} + + +def load_models(dicfile, models, modelstab): + print(f"Loading {dicfile}") + with codecs.open(dicfile, 'r', 'utf-8') as filedic: + for line_num, line in enumerate(filedic, 1): + line = line.strip() + if not line or line.startswith('#'): + continue + if not line.count(';') == 2: + sys.exit(f"Malformed model line {line_num} " + f"in {dicfile}: {line}") + + model, val, _ = line.split(';') + modelstab.append(model) + models[model] = val + + return line_num + + +def load_table(type_, tablefile, modelsdata): + print(f"Loading {tablefile}") + oldmodel, maps = "", [] + ok = True + with codecs.open(tablefile, 'r', 'utf-8') as filetable: + for line_num, line in enumerate(filetable, 1): + line = line.strip() + if not line or line.startswith('#'): + continue + parts = line.split(';') + if len(parts) < 3 or len(parts) > 4: + print(f"Malformed table line {line_num} " + f"in {tablefile}: {line}", + file=sys.stderr) + ok = False + continue + if type_ != "verbe" and len(parts) == 4: + print(f"Malformed non-verb table line {line_num} " + f"in {tablefile} (should have 3 columns):\n{line}", + file=sys.stderr) + ok = False + continue + + modelword, formes, thirds = parts[0], parts[1], parts[2] + fourths = parts[3] if len(parts) == 4 else "" + + if oldmodel and oldmodel != modelword: + modelsdata[oldmodel] = maps[:] + maps.clear() + + oldmodel = modelword + maps.append({"forme": formes, "third": thirds, "fourth": fourths}) + + if oldmodel: + modelsdata[oldmodel] = maps[:] + if not ok: + sys.exit(f"Had problems reading {tablefile}") + return line_num + + +def load_excludes(excludesfile, excludes): + print(f"Loading exclusions file {excludesfile}") + if os.path.isfile(excludesfile): + with codecs.open(excludesfile, 'r', 'utf-8') as excludefile: + for line in excludefile: + line = line.strip() + excludes[line] = True + else: + print(f"File {excludesfile} not found", file=sys.stderr) + + +def racine(type_, model, lem, commun): + if type_ == "verbe": + llem, lmodel = len(lem) - 1, len(model) - 1 + while llem >= 0 and lmodel >= 0 and lem[:llem] == model[:lmodel]: + llem -= 1 + lmodel -= 1 + return lem[:llem + 1], lmodel + 1 + + lfin = len(commun) - (1 if commun.endswith('$') else 0) + racine = lem[:len(lem) - lfin] + longueur = len(model) - lfin + if '^' in model: + longueur -= 1 + + return racine, longueur + + +def traitement(struct, line, types, lem, leminv, categ, norm, filetrace): + if types is None: + print_output(f"{lem};{categ};;;{lem};{norm};") + return + + for type_ in types: + models, modelsdata = (struct[type_]['models'], + struct[type_]['modelsdata']) + subleminv = leminv + '$' + model, commun = None, None + + while subleminv: + if subleminv in models: + model = models[subleminv] + commun = subleminv + break + subleminv = subleminv[:-1] + + if model is None: + filetrace.write(f"Unable to handle line (no model found) {line}\n") + return + + if model not in modelsdata: + filetrace.write(f"Unable to handle line " + f"(no data for model {model}) {line}\n") + return + + racine_val, longueur = racine(type_, model, lem, commun) + for map_ in modelsdata[model]: + forme1 = map_['forme'] + third = map_['third'] if type_ == 'verbe' else map_['fourth'] + fourth = map_['fourth'] if type_ == 'verbe' else map_['third'] + + forme_sortie = racine_val + forme1[longueur:] + if '/' in map_['forme']: + table = map_['forme'].split('/') + forme_sortie = racine_val[table[0]:table[0]+longueur] + outline = (f"{forme_sortie};{categ};{third}" + f";{fourth};{lem};{norm};") + outline = outline.replace('?', '') + print_output(outline) + forme1 = table[1] + + forme_sortie = racine_val[forme1:forme1+longueur] + outline = f"{forme_sortie};{categ};{third};{fourth};{lem};{norm};" + outline = outline.replace('?', '') + print_output(outline) + + +def print_output(line): + global output_file, filetrace, excludes + if line not in excludes: + output_file.write(f"{line}\n") + else: + filetrace.write(f"Excluding line: {line}\n") + + +def main(): + parser = argparse.ArgumentParser(description="Process some files.") + parser.add_argument('deffile', help="The definition file") + parser.add_argument('infile', help="The input file") + parser.add_argument('workdir', help="The working directory") + parser.add_argument('outfile', help="The output file") + parser.add_argument('--excludesfile', help="The excludes file") + + args = parser.parse_args() + + global output_file, filetrace, excludes + output_file = codecs.open( + os.path.join(args.workdir, args.outfile), 'w', 'utf-8') + filetrace = codecs.open( + os.path.join(args.workdir, f"{args.infile}.log"), 'w', 'utf-8') + + categs_types, struct = {}, {} + + load_data(args.deffile, categs_types, struct) + + if args.excludesfile: + load_excludes(args.excludesfile, excludes) + else: + print("no excludes file", file=sys.stderr) + + print("Processing simple words file") + with codecs.open(args.infile, 'r', 'utf-8') as infile: + for line_num, line in enumerate( + tqdm(infile, desc="Flexing lemmas"), 1): + line = line.strip() + if not line or line.startswith('#'): + continue + if len(line.split(';')) < 5: + filetrace.write(f"Malformed line {line_num}: {line}\n") + continue + + parts = line.split(';') + lem, categ, norm = parts[2], parts[3], parts[4] + leminv = lem[::-1] + + if sum(1 for char in lem if char.isupper()) >= 2: + print_output(f"{lem};{categ};;;{lem};{norm};") + else: + traitement(struct, line, categs_types.get(categ), lem, + leminv, categ, norm, filetrace) + + output_file.close() + filetrace.close() + + +if __name__ == "__main__": + main() diff --git a/lima_linguisticdata/scripts/pointvirgules2tabs.py b/lima_linguisticdata/scripts/pointvirgules2tabs.py new file mode 100644 index 000000000..5b8442d16 --- /dev/null +++ b/lima_linguisticdata/scripts/pointvirgules2tabs.py @@ -0,0 +1,32 @@ +# Copyright 2002-2013 CEA LIST +# SPDX-FileCopyrightText: 2022 CEA LIST +# +# SPDX-License-Identifier: MIT + +import argparse + + +def replace_semicolons_with_tabs(fin, fout): + with open(fin, 'r', encoding='utf-8') as infile: + content = infile.read() + + # Replace semicolons with tabs + modified_content = content.replace(';', '\t') + + with open(fout, 'w', encoding='utf-8') as outfile: + outfile.write(modified_content) + + +def main(): + parser = argparse.ArgumentParser( + description="Replace all semicolons with tabulations in a file.") + parser.add_argument('fin', help="The input file") + parser.add_argument('fout', help="The output file") + + args = parser.parse_args() + + replace_semicolons_with_tabs(args.fin, args.fout) + + +if __name__ == "__main__": + main() From 03edf6741e99ae12c5275af02d771cc45fb795fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl=20de=20Chalendar?= Date: Fri, 24 May 2024 08:19:42 +0200 Subject: [PATCH 5/5] Rename a cmake module to match the find command --- CMakeLists.txt | 2 +- cmake/Modules/{FindQHttpServer.cmake => FindQHTTPSERVER.cmake} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cmake/Modules/{FindQHttpServer.cmake => FindQHTTPSERVER.cmake} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index a0ebe6725..f33470964 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -379,7 +379,7 @@ else (TRECPP_FOUND) endif (TRE_FOUND) # QHttpServer is necessary for limaserver HTTP server -find_package(QHttpServer QUIET) +find_package(QHTTPSERVER QUIET) if (NOT QHTTPSERVER_FOUND) message(STATUS "QHttpServer Not found. Lima HTTP server will NOT be built") else () diff --git a/cmake/Modules/FindQHttpServer.cmake b/cmake/Modules/FindQHTTPSERVER.cmake similarity index 100% rename from cmake/Modules/FindQHttpServer.cmake rename to cmake/Modules/FindQHTTPSERVER.cmake