Skip to content

Commit

Permalink
Merge branch 'master' of github.com:aymara/lima
Browse files Browse the repository at this point in the history
  • Loading branch information
benlabbe committed May 24, 2024
2 parents be6a87a + 03edf67 commit 2b41f24
Show file tree
Hide file tree
Showing 8 changed files with 341 additions and 83 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ else (TRECPP_FOUND)
endif (TRE_FOUND)

# QHttpServer is necessary for limaserver HTTP server
find_package(QHttpServer QUIET)
find_package(QHTTPSERVER QUIET)
if (NOT QHTTPSERVER_FOUND)
message(STATUS "QHttpServer Not found. Lima HTTP server will NOT be built")
else ()
Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions lima_linguisticdata/cmake/LinguisticData.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ endmacro (CODES _lang)
macro (FLEXION _lang)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/formes-${_lang}.txt
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/flex.pl def.txt mots-simples.txt ${CMAKE_CURRENT_BINARY_DIR} formes-${_lang}.txt exclude.txt
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/flex.py def.txt mots-simples.txt ${CMAKE_CURRENT_BINARY_DIR} formes-${_lang}.txt --excludesfile exclude.txt
DEPENDS def.txt mots-simples.txt exclude.txt
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
VERBATIM
Expand All @@ -96,9 +96,9 @@ macro(CONVERT _lang)

add_custom_command(
OUTPUT dicotabs.txt
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.pl ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt
COMMENT "perl ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.pl ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt"
COMMENT "python3 ${PROJECT_SOURCE_DIR}/scripts/pointvirgules2tabs.py ${CMAKE_CURRENT_BINARY_DIR}/../flex/formes-${_lang}.txt dicotabs.txt"
VERBATIM
)
add_custom_target(
Expand Down
26 changes: 8 additions & 18 deletions lima_linguisticdata/scripts/flex.pl
Original file line number Diff line number Diff line change
Expand Up @@ -108,30 +108,20 @@ sub traitement

while($allDone==0) {
$debug && print "subleminv='$subleminv'\n";
if (defined ($$models{$subleminv}))
if (defined ($$models{$subleminv}))
{
$model = $$models{$subleminv};
$commun = $subleminv;
last;
}
if(length($subleminv)>0) {
if(length($subleminv)>0)
{
chop ($subleminv);
}
}
else { $allDone=1; }
}

# foreach my $masque (@$modelstab)
# {
# $debug && print "masque='$masque'\n";
# if ($leminv=~/^$masque/)
# {
# $model = $$models{$masque};
# $commun = $masque;
# last;
# }
# }
}

if (!defined($model))
if (!defined($model))
{
print FILETRACE "Unable to handle line (no model found) $infileLineNum: $line\n";
return;
Expand Down Expand Up @@ -247,12 +237,12 @@ sub loadData
}
push @{$$categsTypes{$categ}}, $type;
}
my (@modelentries, %models, @modelstab);
my (%models, @modelstab);

my $nbmodel = loadModels($ficmodel, \%models, \@modelstab);
print "Got $nbmodel $type modelword\n";

my (@models,@third,@fourth,%direct,%modelsdata);
my (%modelsdata);

my $nbtable = loadTable($type, $fictable, \%modelsdata);
print "Got $nbtable $type table elems\n";
Expand Down
237 changes: 237 additions & 0 deletions lima_linguisticdata/scripts/flex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
import argparse
import codecs
import os
import sys

from tqdm import tqdm

output_file, filetrace = None, None
excludes = {}


def load_data(file, categs_types, struct):
    """Read the definition file and populate the per-type model/table data.

    Each data line of `file` has the form ``type;categ1,categ2,...;modelfile;tablefile``.
    For every line, the categories are mapped to the type in `categs_types`,
    and the model/table files are loaded into ``struct[type]``.
    Exits the process on a malformed line.
    """
    with codecs.open(file, 'r', 'utf-8') as deffile:
        for line_num, raw in enumerate(deffile, 1):
            entry = raw.strip()
            # Skip blank lines and comments.
            if not entry or entry.startswith('#'):
                continue
            if entry.count(';') != 3:
                sys.exit(f"Malformed definition line {line_num} "
                         f"in {file}: {entry}")

            type_, categs, ficmodel, fictable = entry.split(';')
            for categ in categs.split(','):
                categs_types.setdefault(categ, []).append(type_)

            models, modelstab = {}, []
            nbmodel = load_models(ficmodel, models, modelstab)
            print(f"Got {nbmodel} {type_} modelword")

            modelsdata = {}
            nbtable = load_table(type_, fictable, modelsdata)
            print(f"Got {nbtable} {type_} table elems")

            struct[type_] = {
                "models": models,
                "modelsdata": modelsdata,
                "modelstab": modelstab,
            }


def load_models(dicfile, models, modelstab):
    """Load a model dictionary file.

    Each data line has the form ``model;value;<ignored>``. The model key is
    appended to `modelstab` and mapped to its value in `models`.

    Returns the number of models actually loaded. (The original returned the
    raw last line number, which counted blank/comment lines and raised a
    NameError on an empty file.) Exits the process on a malformed line.
    """
    print(f"Loading {dicfile}")
    count = 0  # data lines loaded; defined even for an empty file
    with codecs.open(dicfile, 'r', 'utf-8') as filedic:
        for line_num, line in enumerate(filedic, 1):
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith('#'):
                continue
            if line.count(';') != 2:
                sys.exit(f"Malformed model line {line_num} "
                         f"in {dicfile}: {line}")

            model, val, _ = line.split(';')
            modelstab.append(model)
            models[model] = val
            count += 1

    return count


def load_table(type_, tablefile, modelsdata):
    """Load an inflection table file into `modelsdata`.

    Each data line is ``model;forme;third[;fourth]`` — the 4-column form is
    only legal for verbs. Consecutive lines sharing the same model word are
    grouped into one list of ``{'forme', 'third', 'fourth'}`` dicts, keyed
    by the model word in `modelsdata`.

    Returns the number of data lines actually loaded. (The original returned
    the raw last line number, which counted blank/comment lines and raised a
    NameError on an empty file.) Exits the process if any line was malformed.
    """
    print(f"Loading {tablefile}")
    oldmodel, maps = "", []
    ok = True
    count = 0  # data lines loaded; defined even for an empty file
    with codecs.open(tablefile, 'r', 'utf-8') as filetable:
        for line_num, line in enumerate(filetable, 1):
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith('#'):
                continue
            parts = line.split(';')
            if len(parts) < 3 or len(parts) > 4:
                print(f"Malformed table line {line_num} "
                      f"in {tablefile}: {line}",
                      file=sys.stderr)
                ok = False
                continue
            if type_ != "verbe" and len(parts) == 4:
                print(f"Malformed non-verb table line {line_num} "
                      f"in {tablefile} (should have 3 columns):\n{line}",
                      file=sys.stderr)
                ok = False
                continue

            modelword, formes, thirds = parts[0], parts[1], parts[2]
            fourths = parts[3] if len(parts) == 4 else ""

            # A new model word starts: flush the accumulated entries.
            if oldmodel and oldmodel != modelword:
                modelsdata[oldmodel] = maps[:]
                maps.clear()

            oldmodel = modelword
            maps.append({"forme": formes, "third": thirds, "fourth": fourths})
            count += 1

    # Flush the trailing group (if the file had any data line at all).
    if oldmodel:
        modelsdata[oldmodel] = maps[:]
    if not ok:
        sys.exit(f"Had problems reading {tablefile}")
    return count


def load_excludes(excludesfile, excludes):
    """Load the exclusion file: every stripped line becomes a key of `excludes`.

    If the file does not exist, a message is printed to stderr and
    `excludes` is left untouched.
    """
    print(f"Loading exclusions file {excludesfile}")
    if not os.path.isfile(excludesfile):
        print(f"File {excludesfile} not found", file=sys.stderr)
        return
    with codecs.open(excludesfile, 'r', 'utf-8') as handle:
        for raw in handle:
            excludes[raw.strip()] = True


def racine(type_, model, lem, commun):
    """Compute the stem of `lem` and the length of the model's ending.

    Returns a ``(stem, ending_length)`` pair. For verbs the stem is found by
    shrinking prefixes of `lem` and `model` in lockstep; for other types the
    length of the common reversed suffix `commun` (minus its '$' sentinel)
    is stripped from the lemma.
    """
    if type_ == "verbe":
        i = len(lem) - 1
        j = len(model) - 1
        # Shrink both prefixes together while they still coincide.
        while i >= 0 and j >= 0 and lem[:i] == model[:j]:
            i -= 1
            j -= 1
        return lem[:i + 1], j + 1

    # Length of the shared ending, not counting the '$' end marker.
    ending_len = len(commun)
    if commun.endswith('$'):
        ending_len -= 1
    stem = lem[:len(lem) - ending_len]
    model_len = len(model) - ending_len
    if '^' in model:
        # A '^' start marker in the model does not count toward its length.
        model_len -= 1
    return stem, model_len


def traitement(struct, line, types, lem, leminv, categ, norm, filetrace):
    """Generate and emit all inflected forms for one input lemma.

    `types` is the list of inflection types for the lemma's category (or
    None when the category is unknown, in which case the lemma is emitted
    uninflected). For each type, the longest model matching the reversed
    lemma is found, the stem is computed with racine(), and one output line
    per table entry is written via print_output(). Problems are logged to
    `filetrace`.
    """
    if types is None:
        # Unknown category: pass the lemma through uninflected.
        print_output(f"{lem};{categ};;;{lem};{norm};")
        return

    for type_ in types:
        models, modelsdata = (struct[type_]['models'],
                              struct[type_]['modelsdata'])

        # Find the longest model key that prefixes the reversed lemma
        # (with a '$' end-of-word sentinel), chopping one char at a time.
        subleminv = leminv + '$'
        model, commun = None, None
        while subleminv:
            if subleminv in models:
                model = models[subleminv]
                commun = subleminv
                break
            subleminv = subleminv[:-1]

        if model is None:
            filetrace.write(f"Unable to handle line (no model found) {line}\n")
            return

        if model not in modelsdata:
            filetrace.write(f"Unable to handle line "
                            f"(no data for model {model}) {line}\n")
            return

        racine_val, longueur = racine(type_, model, lem, commun)
        for map_ in modelsdata[model]:
            forme1 = map_['forme']
            # NOTE(review): third/fourth columns are swapped for non-verb
            # types — presumably mirroring the 3-column table layout;
            # confirm against the Perl original and the data files.
            third = map_['third'] if type_ == 'verbe' else map_['fourth']
            fourth = map_['fourth'] if type_ == 'verbe' else map_['third']

            if '/' in forme1:
                # 'a/b' denotes two alternative endings: emit the first
                # here, then fall through to emit the second below.
                table = forme1.split('/')
                # Fixed: the original sliced racine_val with *string*
                # indices (racine_val[table[0]:table[0]+longueur]), a
                # guaranteed TypeError. Per the Perl original
                # (racine . substr(forme, longueur)), the ending's suffix
                # past the common stem length is appended to the stem.
                forme_sortie = racine_val + table[0][longueur:]
                outline = (f"{forme_sortie};{categ};{third}"
                           f";{fourth};{lem};{norm};")
                outline = outline.replace('?', '')
                print_output(outline)
                forme1 = table[1]

            # Same fix as above for the regular (or second-alternative) form.
            forme_sortie = racine_val + forme1[longueur:]
            outline = f"{forme_sortie};{categ};{third};{fourth};{lem};{norm};"
            outline = outline.replace('?', '')
            print_output(outline)


def print_output(line):
    """Write `line` to the global output file, unless it is excluded.

    Excluded lines are logged to the global trace file instead.
    """
    global output_file, filetrace, excludes
    if line in excludes:
        filetrace.write(f"Excluding line: {line}\n")
    else:
        output_file.write(f"{line}\n")


def main():
    """CLI entry point: inflect every lemma of the input file.

    Reads the definition file to load per-type model/table data, then
    processes the simple-words file line by line, writing all generated
    forms to workdir/outfile and problems to a workdir/<infile>.log trace.
    """
    parser = argparse.ArgumentParser(description="Process some files.")
    parser.add_argument('deffile', help="The definition file")
    parser.add_argument('infile', help="The input file")
    parser.add_argument('workdir', help="The working directory")
    parser.add_argument('outfile', help="The output file")
    parser.add_argument('--excludesfile', help="The excludes file")

    args = parser.parse_args()

    # Output and trace handles are module globals so print_output() can
    # reach them without threading them through every call.
    global output_file, filetrace, excludes
    output_file = codecs.open(
        os.path.join(args.workdir, args.outfile), 'w', 'utf-8')
    # NOTE(review): the log path joins workdir with the raw infile value;
    # if infile is itself a path, the log lands under workdir/<path>.log.
    filetrace = codecs.open(
        os.path.join(args.workdir, f"{args.infile}.log"), 'w', 'utf-8')

    categs_types, struct = {}, {}

    # Populate category->types mapping and per-type model/table data.
    load_data(args.deffile, categs_types, struct)

    if args.excludesfile:
        load_excludes(args.excludesfile, excludes)
    else:
        print("no excludes file", file=sys.stderr)

    print("Processing simple words file")
    with codecs.open(args.infile, 'r', 'utf-8') as infile:
        for line_num, line in enumerate(
                tqdm(infile, desc="Flexing lemmas"), 1):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # A valid line has at least 5 ';'-separated columns; the lemma,
            # category and normalized form sit in columns 3-5.
            if len(line.split(';')) < 5:
                filetrace.write(f"Malformed line {line_num}: {line}\n")
                continue

            parts = line.split(';')
            lem, categ, norm = parts[2], parts[3], parts[4]
            # Models are keyed on the reversed lemma.
            leminv = lem[::-1]

            # Words with two or more uppercase letters (acronyms, proper
            # nouns) are passed through uninflected.
            if sum(1 for char in lem if char.isupper()) >= 2:
                print_output(f"{lem};{categ};;;{lem};{norm};")
            else:
                traitement(struct, line, categs_types.get(categ), lem,
                           leminv, categ, norm, filetrace)

    output_file.close()
    filetrace.close()


if __name__ == "__main__":
    main()
32 changes: 32 additions & 0 deletions lima_linguisticdata/scripts/pointvirgules2tabs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright 2002-2013 CEA LIST
# SPDX-FileCopyrightText: 2022 CEA LIST <[email protected]>
#
# SPDX-License-Identifier: MIT

import argparse


def replace_semicolons_with_tabs(fin, fout):
    """Copy the file `fin` to `fout`, turning every ';' into a tab."""
    with open(fin, 'r', encoding='utf-8') as src:
        text = src.read()

    with open(fout, 'w', encoding='utf-8') as dst:
        dst.write(text.replace(';', '\t'))


def main():
    """CLI entry point: parse the input/output paths and run the conversion."""
    parser = argparse.ArgumentParser(
        description="Replace all semicolons with tabulations in a file.")
    parser.add_argument('fin', help="The input file")
    parser.add_argument('fout', help="The output file")
    opts = parser.parse_args()

    replace_semicolons_with_tabs(opts.fin, opts.fout)


if __name__ == "__main__":
    main()
Loading

0 comments on commit 2b41f24

Please sign in to comment.