diff --git a/lima_linguisticdata/cmake/LinguisticData.cmake b/lima_linguisticdata/cmake/LinguisticData.cmake
index 4cda79d1b..b6d90cbfc 100644
--- a/lima_linguisticdata/cmake/LinguisticData.cmake
+++ b/lima_linguisticdata/cmake/LinguisticData.cmake
@@ -73,7 +73,7 @@ endmacro (CODES _lang)
 macro (FLEXION _lang)
   add_custom_command(
     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/formes-${_lang}.txt
-    COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/flex.pl def.txt mots-simples.txt ${CMAKE_CURRENT_BINARY_DIR} formes-${_lang}.txt exclude.txt
-    DEPENDS def.txt mots-simples.txt exclude.txt
+    COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/flex.py def.txt mots-simples.txt ${CMAKE_CURRENT_BINARY_DIR} formes-${_lang}.txt --excludesfile exclude.txt
+    DEPENDS def.txt mots-simples.txt exclude.txt ${PROJECT_SOURCE_DIR}/scripts/flex.py
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
     VERBATIM
@@ -96,9 +96,9 @@ macro(CONVERT _lang)
 
   add_custom_command(
     OUTPUT dicotabs.txt
-    COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.pl ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt
-    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt
-    COMMENT "perl ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.pl ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt"
+    COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py
+    COMMENT "python3 ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt"
     VERBATIM
   )
   add_custom_target(
diff --git a/lima_linguisticdata/scripts/flex.pl b/lima_linguisticdata/scripts/flex.pl
index a542180bb..35d47a230 100755
--- a/lima_linguisticdata/scripts/flex.pl
+++ b/lima_linguisticdata/scripts/flex.pl
@@ -108,30 +108,20 @@ sub traitement
     while($allDone==0)
     {
       $debug && print "subleminv='$subleminv'\n";
-      if (defined ($$models{$subleminv})) 
+      if (defined ($$models{$subleminv}))
       {
         $model = $$models{$subleminv};
         $commun = $subleminv;
         last;
       }
-      if(length($subleminv)>0) {
+      if(length($subleminv)>0)
+      {
         chop ($subleminv);
-      } 
+      }
       else { $allDone=1; }
-    } 
-
-#   foreach my $masque (@$modelstab)
-#   {
-#     $debug && print "masque='$masque'\n";
-#     if ($leminv=~/^$masque/)
-#     {
-#       $model = $$models{$masque};
-#       $commun = $masque;
-#       last;
-#     }
-#   }
+    }
 
-    if (!defined($model)) 
+    if (!defined($model))
     {
       print FILETRACE "Unable to handle line (no model found) $infileLineNum: $line\n";
       return;
@@ -247,12 +237,12 @@ sub loadData
       }
       push @{$$categsTypes{$categ}}, $type;
     }
-    my (@modelentries, %models, @modelstab);
+    my (%models, @modelstab);
     my $nbmodel = loadModels($ficmodel, \%models, \@modelstab);
     print "Got $nbmodel $type modelword\n";
 
 
-    my (@models,@third,@fourth,%direct,%modelsdata);
+    my (%modelsdata);
     my $nbtable = loadTable($type, $fictable, \%modelsdata);
     print "Got $nbtable $type table elems\n";
 
diff --git a/lima_linguisticdata/scripts/flex.py b/lima_linguisticdata/scripts/flex.py
new file mode 100644
index 000000000..b178be6de
--- /dev/null
+++ b/lima_linguisticdata/scripts/flex.py
@@ -0,0 +1,279 @@
+"""Generate the inflected forms of the lemmas of a simple words file.
+
+Usage: flex.py deffile infile workdir outfile [--excludesfile FILE]
+
+Each useful line of the definition file is "type;categories;models;table":
+"categories" is a comma-separated list, "models" and "table" are file names.
+The forms are written to the file "outfile" of "workdir"; problems are logged
+to the file "<infile>.log" of "workdir".
+"""
+
+import argparse
+import codecs
+import os
+import sys
+
+try:
+    from tqdm import tqdm
+except ImportError:
+    # tqdm only draws a progress bar: do not fail the build without it.
+    def tqdm(iterable, **_kwargs):
+        return iterable
+
+output_file, filetrace = None, None
+excludes = {}
+
+
+def load_data(file, categs_types, struct):
+    """Load the definition file and the models and table files it lists.
+
+    Fills categs_types (category -> list of types) and struct
+    (type -> {"models", "modelsdata", "modelstab"}). Exits on a malformed
+    definition line.
+    """
+    with codecs.open(file, 'r', 'utf-8') as deffile:
+        for line_num, line in enumerate(deffile, 1):
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            if line.count(';') != 3:
+                sys.exit(f"Malformed definition line {line_num} "
+                         f"in {file}: {line}")
+
+            type_, categs, ficmodel, fictable = line.split(';')
+            for categ in categs.split(','):
+                categs_types.setdefault(categ, []).append(type_)
+
+            models, modelstab = {}, []
+            nbmodel = load_models(ficmodel, models, modelstab)
+            print(f"Got {nbmodel} {type_} modelword")
+
+            modelsdata = {}
+            nbtable = load_table(type_, fictable, modelsdata)
+            print(f"Got {nbtable} {type_} table elems")
+
+            struct[type_] = {"models": models,
+                             "modelsdata": modelsdata,
+                             "modelstab": modelstab}
+
+
+def load_models(dicfile, models, modelstab):
+    """Load a models file made of "key;model;ignored" lines.
+
+    models maps every key to its model; modelstab receives the keys in file
+    order. Returns the number of entries loaded (0 for an empty file) and
+    exits on a malformed line.
+    """
+    print(f"Loading {dicfile}")
+    loaded = 0
+    with codecs.open(dicfile, 'r', 'utf-8') as filedic:
+        for line_num, line in enumerate(filedic, 1):
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            if line.count(';') != 2:
+                sys.exit(f"Malformed model line {line_num} "
+                         f"in {dicfile}: {line}")
+
+            key, val, _ = line.split(';')
+            modelstab.append(key)
+            models[key] = val
+            loaded += 1
+
+    return loaded
+
+
+def load_table(type_, tablefile, modelsdata):
+    """Load a table file made of "model;forme;third[;fourth]" lines.
+
+    The fourth column is only accepted when type_ is "verbe". Consecutive
+    lines of the same model are stored in modelsdata[model] as a list of
+    {"forme", "third", "fourth"} dicts. Returns the number of entries loaded
+    (0 for an empty file); exits once the whole file has been read if any
+    line was malformed.
+    """
+    print(f"Loading {tablefile}")
+    oldmodel, maps = "", []
+    loaded = 0
+    ok = True
+    with codecs.open(tablefile, 'r', 'utf-8') as filetable:
+        for line_num, line in enumerate(filetable, 1):
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            parts = line.split(';')
+            if len(parts) < 3 or len(parts) > 4:
+                print(f"Malformed table line {line_num} "
+                      f"in {tablefile}: {line}",
+                      file=sys.stderr)
+                ok = False
+                continue
+            if type_ != "verbe" and len(parts) == 4:
+                print(f"Malformed non-verb table line {line_num} "
+                      f"in {tablefile} (should have 3 columns):\n{line}",
+                      file=sys.stderr)
+                ok = False
+                continue
+
+            modelword, formes, thirds = parts[0], parts[1], parts[2]
+            fourths = parts[3] if len(parts) == 4 else ""
+
+            if oldmodel and oldmodel != modelword:
+                modelsdata[oldmodel] = maps[:]
+                maps.clear()
+
+            oldmodel = modelword
+            maps.append({"forme": formes, "third": thirds, "fourth": fourths})
+            loaded += 1
+
+    if oldmodel:
+        modelsdata[oldmodel] = maps[:]
+    if not ok:
+        sys.exit(f"Had problems reading {tablefile}")
+    return loaded
+
+
+def load_excludes(excludesfile, excludes):
+    """Add every stripped line of excludesfile as a key of excludes.
+
+    A missing file is only reported on stderr.
+    """
+    print(f"Loading exclusions file {excludesfile}")
+    if os.path.isfile(excludesfile):
+        with codecs.open(excludesfile, 'r', 'utf-8') as excludefile:
+            for line in excludefile:
+                excludes[line.strip()] = True
+    else:
+        print(f"File {excludesfile} not found", file=sys.stderr)
+
+
+def racine(type_, model, lem, commun):
+    """Return the root of lem and the length of the root of model.
+
+    For verbs, the common ending of lem and model is removed from both. For
+    the other types, the ending removed has the length of commun (the key
+    that matched), not counting its final '$'.
+    """
+    if type_ == "verbe":
+        llem, lmodel = len(lem) - 1, len(model) - 1
+        # Compare characters from the end. Comparing the prefixes lem[:llem]
+        # and model[:lmodel] stops at once when the lemma is not its model.
+        while llem >= 0 and lmodel >= 0 and lem[llem] == model[lmodel]:
+            llem -= 1
+            lmodel -= 1
+        return lem[:llem + 1], lmodel + 1
+
+    lfin = len(commun) - (1 if commun.endswith('$') else 0)
+    root = lem[:len(lem) - lfin]
+    longueur = len(model) - lfin
+    if '^' in model:
+        longueur -= 1
+
+    return root, longueur
+
+
+def traitement(struct, line, types, lem, leminv, categ, norm, filetrace):
+    """Output every form of the lemma lem for each type of types.
+
+    The model of the lemma is the one of the longest prefix of leminv + '$'
+    found in the models of the type. The lemma is output unchanged when
+    types is None. When no model, or no table data for the model, is found,
+    the line is logged to filetrace and the processing of the lemma stops.
+    """
+    if types is None:
+        print_output(f"{lem};{categ};;;{lem};{norm};")
+        return
+
+    for type_ in types:
+        models = struct[type_]['models']
+        modelsdata = struct[type_]['modelsdata']
+
+        model, commun = None, None
+        subleminv = leminv + '$'
+        while subleminv:
+            if subleminv in models:
+                model, commun = models[subleminv], subleminv
+                break
+            subleminv = subleminv[:-1]
+
+        if model is None:
+            filetrace.write(f"Unable to handle line (no model found) {line}\n")
+            return
+
+        if model not in modelsdata:
+            filetrace.write(f"Unable to handle line "
+                            f"(no data for model {model}) {line}\n")
+            return
+
+        racine_val, longueur = racine(type_, model, lem, commun)
+        for map_ in modelsdata[model]:
+            if type_ == 'verbe':
+                third, fourth = map_['third'], map_['fourth']
+            else:
+                third, fourth = map_['fourth'], map_['third']
+
+            # "a/b" holds two alternative forms: one output line for each.
+            # A form is the root of the lemma followed by the table form
+            # without the root of its model.
+            for forme in map_['forme'].split('/')[:2]:
+                forme_sortie = racine_val + forme[longueur:]
+                outline = (f"{forme_sortie};{categ};{third}"
+                           f";{fourth};{lem};{norm};")
+                print_output(outline.replace('?', ''))
+
+
+def print_output(line):
+    """Write line to the output file, or log it when it is excluded."""
+    if line not in excludes:
+        output_file.write(f"{line}\n")
+    else:
+        filetrace.write(f"Excluding line: {line}\n")
+
+
+def main():
+    """Parse the command line, load the data and flex the input file."""
+    parser = argparse.ArgumentParser(description="Process some files.")
+    parser.add_argument('deffile', help="The definition file")
+    parser.add_argument('infile', help="The input file")
+    parser.add_argument('workdir', help="The working directory")
+    parser.add_argument('outfile', help="The output file")
+    parser.add_argument('--excludesfile', help="The excludes file")
+
+    args = parser.parse_args()
+
+    global output_file, filetrace
+    outpath = os.path.join(args.workdir, args.outfile)
+    logpath = os.path.join(args.workdir, f"{args.infile}.log")
+    # Both files get closed even when a loader exits on malformed data.
+    with codecs.open(outpath, 'w', 'utf-8') as output_file, \
+            codecs.open(logpath, 'w', 'utf-8') as filetrace:
+        categs_types, struct = {}, {}
+        load_data(args.deffile, categs_types, struct)
+
+        if args.excludesfile:
+            load_excludes(args.excludesfile, excludes)
+        else:
+            print("no excludes file", file=sys.stderr)
+
+        print("Processing simple words file")
+        with codecs.open(args.infile, 'r', 'utf-8') as infile:
+            for line_num, line in enumerate(
+                    tqdm(infile, desc="Flexing lemmas"), 1):
+                line = line.strip()
+                if not line or line.startswith('#'):
+                    continue
+                parts = line.split(';')
+                if len(parts) < 5:
+                    filetrace.write(f"Malformed line {line_num}: {line}\n")
+                    continue
+
+                lem, categ, norm = parts[2], parts[3], parts[4]
+                if sum(1 for char in lem if char.isupper()) >= 2:
+                    print_output(f"{lem};{categ};;;{lem};{norm};")
+                else:
+                    traitement(struct, line, categs_types.get(categ), lem,
+                               lem[::-1], categ, norm, filetrace)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lima_linguisticdata/scripts/pointvirgules2tabs.py b/lima_linguisticdata/scripts/pointvirgules2tabs.py
new file mode 100644
index 000000000..5b8442d16
--- /dev/null
+++ b/lima_linguisticdata/scripts/pointvirgules2tabs.py
@@ -0,0 +1,34 @@
+# Copyright 2002-2013 CEA LIST
+# SPDX-FileCopyrightText: 2022 CEA LIST
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+
+
+def replace_semicolons_with_tabs(fin, fout):
+    """Write to fout the UTF-8 text of fin with every ';' turned into a tab."""
+    with open(fin, 'r', encoding='utf-8') as infile:
+        content = infile.read()
+
+    # Replace semicolons with tabs
+    modified_content = content.replace(';', '\t')
+
+    with open(fout, 'w', encoding='utf-8') as outfile:
+        outfile.write(modified_content)
+
+
+def main():
+    """Parse the command line and convert the input file."""
+    parser = argparse.ArgumentParser(
+        description="Replace all semicolons with tabulations in a file.")
+    parser.add_argument('fin', help="The input file")
+    parser.add_argument('fout', help="The output file")
+
+    args = parser.parse_args()
+
+    replace_semicolons_with_tabs(args.fin, args.fout)
+
+
+if __name__ == "__main__":
+    main()