Skip to content

Commit

Permalink
Merge branch 'master' of github.com:aymara/lima
Browse files Browse the repository at this point in the history
  • Loading branch information
benlabbe committed May 23, 2024
2 parents b8e4b3f + 596f21e commit be6a87a
Show file tree
Hide file tree
Showing 8 changed files with 252 additions and 26 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
repos:
- repo: https://github.com/ambv/black
rev: 23.7.0
hooks:
- id: black
language_version: python3.10
#- repo: https://github.com/ambv/black
# rev: 23.7.0
# hooks:
# - id: black
# language_version: python3.7
- repo: https://github.com/pycqa/flake8.git
rev: 3.9.1
hooks:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py eng forms.dic ${CMAKE_CURRENT_BINARY_DIR}/../../../disambiguisationMatrices/eng/corpus_eng_merge.txt forms.dic.2
#COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2
COMMAND LC_ALL="C" sort -u forms.dic.2 | LC_ALL="C" comm -23 - ${CMAKE_CURRENT_SOURCE_DIR}/to_ignore.dic > forms.dic.3
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/compound_entries.txt >> dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
Expand All @@ -40,10 +40,10 @@ else()
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py eng forms.dic ${CMAKE_CURRENT_BINARY_DIR}/../../../disambiguisationMatrices/eng/corpus_eng_merge.txt forms.dic.2
#COMMAND sort -u -o forms.dic.3 forms.dic.2
COMMAND sort -u forms.dic.2 | comm -23 - ${CMAKE_CURRENT_SOURCE_DIR}/to_ignore.dic > forms.dic.3
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/compound_entries.txt >> dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
COMMAND cat ${dicoFiles} > forms.dic
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py fre forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/fre/corpus/corpus_fre.txt forms.dic.2
COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
Expand All @@ -37,9 +37,9 @@ else()
# here without clear error messages, verify that the sort command used is
# not the Windows one.
COMMAND sort -u -o forms.dic.3 forms.dic.2
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
COMMAND cat ${dicoFiles} > forms.dic
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py por forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/por/corpus/macmorpho.conll.txt forms.dic.2
COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
Expand All @@ -36,10 +36,10 @@ else()
COMMAND cat ${dicoFiles} > forms.dic
COMMAND python ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py por forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/por/corpus/macmorpho.conll.txt forms.dic.2
COMMAND sort -u -o forms.dic.3 forms.dic.2
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
Expand Down
12 changes: 6 additions & 6 deletions lima_linguisticdata/cmake/LinguisticData.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,12 @@ macro(CONVERT _lang)
add_custom_command(
OUTPUT dico.xml
COMMAND echo "<dictionary>" > dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl dicocompletstd.txt dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py dicocompletstd.txt dico.xml.tmp
COMMAND bash -c "if [ -n \"${ARGN}\" ]; then cat ${ARGN} >> dico.xml.tmp; fi"
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/addnormfield.pl ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt > dicoponctu.norm.txt
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no dicoponctu.norm.txt dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no dicoponctu.norm.txt dico.xml.tmp
COMMAND echo "</dictionary>" >> dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS dicocompletstd.txt ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt
COMMENT "CONVERT ${_lang} produce XML dico"
Expand All @@ -175,12 +175,12 @@ macro(CONVERT _lang)
add_custom_command(
OUTPUT dico.xml
COMMAND echo ^<dictionary^> > dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl dicocompletstd.txt dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py dicocompletstd.txt dico.xml.tmp
COMMAND bash -c "if [ -n \"${ARGN}\" ]; then cat ${ARGN} >> dico.xml.tmp; fi"
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/addnormfield.pl ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt > dicoponctu.norm.txt
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no dicoponctu.norm.txt dico.xml.tmp
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no dicoponctu.norm.txt dico.xml.tmp
COMMAND echo ^</dictionary^> >> dico.xml.tmp
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS dicocompletstd.txt ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt
COMMENT "produce XML dico"
Expand Down
4 changes: 3 additions & 1 deletion lima_linguisticdata/rules-idiom/fre/examples/test-example.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ for example in $*; do
if [[ -a $exampleFile ]]; then
cp $exampleFile resources/idiom-examples

perl -pe 's%LinguisticProcessings/fre/idiomaticExpressions-fre.rules%idiom-examples/$ENV{"exampleFile"}%' $LIMA_CONF/lima-lp-tva-fre.xml > conf/lima-lp-tva-fre-example.xml;
# Produce conf/lima-lp-tva-fre-example.xml from the stock TVA configuration,
# pointing it at the example rules file instead of the default French idiom
# rules. The one-liner deliberately avoids an f-string with nested single
# quotes (a SyntaxError before Python 3.12) and uses a literal str.replace so
# neither the rules path nor the example file name is interpreted as a regex.
# exampleFile must be exported; when it is not, it expands to an empty string
# just like $ENV{"exampleFile"} did in the Perl version.
python -c "import os; src = open(os.path.join(os.getenv('LIMA_CONF'), 'lima-lp-tva-fre.xml'), 'r', encoding='utf-8').read(); open('conf/lima-lp-tva-fre-example.xml', 'w', encoding='utf-8').write(src.replace('LinguisticProcessings/fre/idiomaticExpressions-fre.rules', 'idiom-examples/' + os.getenv('exampleFile', '')))"

# perl -pe 's%LinguisticProcessings/fre/idiomaticExpressions-fre.rules%idiom-examples/$ENV{"exampleFile"}%' $LIMA_CONF/lima-lp-tva-fre.xml > conf/lima-lp-tva-fre-example.xml;
echo "running test on $exampleFile";
tva --language=fre --resources-dir=resources --config-dir=conf --lp-config-file=lima-lp-tva-example.xml idiom-example-fre-test.xml >& tva-$example.log;
egrep "(TestReport|total)" tva-$example.log
Expand Down
99 changes: 99 additions & 0 deletions lima_linguisticdata/scripts/cmakeconvertdefautjys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3

# Copyright 2002-2024 CEA LIST
# SPDX-FileCopyrightText: 2024 CEA LIST <[email protected]>
#
# SPDX-License-Identifier: MIT

###############################################
# Categories conversion program :
# converts dicostd into dicojys (data multiplicative coding)
###############################################

import argparse
import sys

from tqdm import tqdm


def _open_or_exit(path, mode):
    """Open *path* as UTF-8 text; on failure report it and exit with 1."""
    try:
        return open(path, mode, encoding="utf-8")
    except IOError:
        print(f"Cannot open {path}", file=sys.stderr)
        sys.exit(1)


def main():
    """Convert a default-categories file using a category conversion table.

    Command-line arguments:
        source:  tab-separated input file with two columns per line
                 (type, properties). Empty lines and lines starting with
                 ``#`` are skipped.
        convert: ``;``-separated conversion table; the first field is the
                 key and the second field is the value written in its place.
        cible:   output file, overwritten, receiving ``type<TAB>value`` lines.

    Lines with a wrong column count, or whose properties have no non-empty
    entry in the conversion table, are reported on stderr and appended to
    ``error.txt`` in the current directory. The process exits with status 1
    if any of the four files cannot be opened.
    """
    parser = argparse.ArgumentParser(description="Convert dicostd into dicojys.")
    parser.add_argument("source", type=str, help="Source file")
    parser.add_argument("convert", type=str, help="Convert file")
    parser.add_argument("cible", type=str, help="Cible file")

    args = parser.parse_args()

    print("INFO : Start default categories conversion", file=sys.stderr)

    # A single ``with`` guarantees every handle is flushed and closed, even
    # when a later open fails or an exception is raised while converting.
    with _open_or_exit(args.source, "r") as source, \
            _open_or_exit(args.convert, "r") as convert, \
            _open_or_exit(args.cible, "w") as cible, \
            _open_or_exit("error.txt", "a") as error:
        # Conversion table: first ';'-separated field -> second field.
        tags = {}
        for line in convert:
            donneestags = line.strip().split(";")
            if len(donneestags) > 1:
                tags[donneestags[0]] = donneestags[1]

        # The lines are materialized so the progress bar knows the total.
        source_lines = source.readlines()
        progress = tqdm(source_lines, desc="Processing lines", unit="line")
        for line_num, line in enumerate(progress, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            donnees = line.split("\t")
            if len(donnees) != 2:
                error_message = (
                    f"in file {args.source} line {line_num}: "
                    f"wrong number of columns. Ignore line: {line}\n"
                )
                print(error_message, file=sys.stderr)
                error.write(error_message)
                continue

            type_, info = donnees

            # An unknown key and an empty mapped value are both rejected.
            converted = tags.get(info)
            if converted:
                cible.write(f"{type_}\t{converted}\n")
            else:
                error_message = (
                    f"in file {args.source} line {line_num}: "
                    f"Invalid properties {type_} {info}\n"
                )
                print(error_message, file=sys.stderr)
                error.write(error_message)


if __name__ == "__main__":
main()
125 changes: 125 additions & 0 deletions lima_linguisticdata/scripts/xmlforms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/env python3

# Copyright 2002-2024 CEA LIST
# SPDX-FileCopyrightText: 2024 CEA LIST <[email protected]>
#
# SPDX-License-Identifier: MIT

import argparse
import codecs

from tqdm import tqdm


def print_usage():
    """Print the xmlforms command-line usage summary on standard output."""
    usage_lines = (
        "USAGE : xmlforms [OPTIONS] inputfile outputfile",
        "where [OPTIONS] are :",
        " -h or -help : print usage",
        " -desacc=[yes|no] : specify desacc attribute for entries. "
        "default is none, that equals 'yes'",
        " -entryop=[add|replace|delete] : specify op attribute for entries. "
        "default is none, that equals 'add'",
        " -lingop=[add|replace|delete] : specify op attribute for linginfos. "
        "default is none, that equals 'add'",
    )
    # One print call; the trailing newline comes from print itself.
    print("\n".join(usage_lines))


def _escape_xml(text):
    """Escape the characters that are special in XML attribute values."""
    # '&' must be replaced first so the entities added below are not
    # escaped a second time.
    return (
        text.replace("&", "&amp;")
        .replace('"', "&quot;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )


def main():
    """Append XML dictionary entries built from a tab-separated forms file.

    Command-line arguments:
        inputfile:  UTF-8 file with four tab-separated columns per line
                    (form, lemma, normalized form, property value). Anything
                    after a ``#`` on a line is dropped.
        outputfile: UTF-8 file the XML is appended to.
        -desacc, -entryop, -lingop: optional attribute values copied onto
                    the ``<entry>`` / ``<i>`` elements when given.

    Consecutive lines sharing the same form are grouped in one ``<entry>``;
    inside it, consecutive lines sharing lemma and normalized form are
    grouped in one ``<i>``. Lines that do not have exactly four columns are
    reported on standard output and skipped.
    """
    parser = argparse.ArgumentParser(
        description="Process input and output files with options."
    )
    parser.add_argument("inputfile", type=str, help="Input file")
    parser.add_argument("outputfile", type=str, help="Output file")
    parser.add_argument(
        "-desacc",
        type=str,
        choices=["yes", "no"],
        help="Specify desacc attribute for entries. "
        'Default is none, that equals "yes"',
    )
    parser.add_argument(
        "-entryop",
        type=str,
        choices=["add", "replace", "delete"],
        help="Specify op attribute for entries. " 'Default is none, that equals "add"',
    )
    parser.add_argument(
        "-lingop",
        type=str,
        choices=["add", "replace", "delete"],
        help="Specify op attribute for linginfos. "
        'Default is none, that equals "add"',
    )

    args = parser.parse_args()

    # Backslash continuation instead of a parenthesized ``with (...)``: the
    # parenthesized form is only official grammar from Python 3.10 on.
    # The output is opened in append mode on purpose: this function never
    # writes a root element, so existing content of the file is preserved.
    with codecs.open(args.inputfile, "r", "utf-8") as source, \
            codecs.open(args.outputfile, "a", "utf-8") as out:
        # Values of the previous accepted line, used to detect group changes.
        form = ""
        lemma = ""
        norm = ""

        count = 0  # number of <entry> elements opened so far
        icount = 0  # number of <i> elements opened in the current entry

        # The lines are materialized so the progress bar knows the total.
        lines = source.readlines()
        for line in tqdm(lines, desc="Processing lines", unit="line"):
            line = _escape_xml(line.strip())
            if line == "":
                continue

            # Drop the trailing comment, then split into columns.
            line = line.split("#")[0].strip()
            data = line.split("\t")
            if line == "" or len(data) != 4:
                print(f"xmlform: Invalid line '{line}'")
                continue

            if data[0] != form:
                # New form: close the previous entry, if any, and open one.
                form = data[0]
                if count > 0:
                    out.write(" </i>\n</entry>\n")
                out.write(f'<entry k="{form}"')
                if args.desacc:
                    out.write(f' desacc="{args.desacc}"')
                if args.entryop:
                    out.write(f' op="{args.entryop}"')
                out.write(">\n")
                icount = 0
                count += 1

            if icount == 0 or data[1] != lemma or data[2] != norm:
                # New (lemma, norm) pair inside the current entry.
                lemma = data[1]
                norm = data[2]
                if icount > 0:
                    out.write(" </i>\n")
                out.write(" <i")
                if lemma:
                    out.write(f' l="{lemma}"')
                if norm:
                    out.write(f' n="{norm}"')
                if args.lingop:
                    out.write(f' op="{args.lingop}"')
                out.write(">\n")
                icount += 1

            out.write(f' <p v="{data[3]}"/>\n')

        # Close the last entry, which is still open after the loop.
        if count > 0:
            out.write(" </i>\n</entry>\n")


if __name__ == "__main__":
main()

0 comments on commit be6a87a

Please sign in to comment.