diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2da781464..7c662dbfa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,9 @@ repos: -- repo: https://github.com/ambv/black - rev: 23.7.0 - hooks: - - id: black - language_version: python3.10 + #- repo: https://github.com/ambv/black + # rev: 23.7.0 + # hooks: + # - id: black + # language_version: python3.7 - repo: https://github.com/pycqa/flake8.git rev: 3.9.1 hooks: diff --git a/lima_linguisticdata/analysisDictionary/eng/convert/CMakeLists.txt b/lima_linguisticdata/analysisDictionary/eng/convert/CMakeLists.txt index 04ba6cb36..5049f9d65 100644 --- a/lima_linguisticdata/analysisDictionary/eng/convert/CMakeLists.txt +++ b/lima_linguisticdata/analysisDictionary/eng/convert/CMakeLists.txt @@ -17,10 +17,10 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py eng forms.dic ${CMAKE_CURRENT_BINARY_DIR}/../../../disambiguisationMatrices/eng/corpus_eng_merge.txt forms.dic.2 #COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2 COMMAND LC_ALL="C" sort -u forms.dic.2 | LC_ALL="C" comm -23 - ${CMAKE_CURRENT_SOURCE_DIR}/to_ignore.dic > forms.dic.3 - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/compound_entries.txt >> dico.xml.tmp COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat COMMAND mv dico.xml.tmp dico.xml DEPENDS ${dicoFiles} @@ -40,10 +40,10 @@ else() COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py eng 
forms.dic ${CMAKE_CURRENT_BINARY_DIR}/../../../disambiguisationMatrices/eng/corpus_eng_merge.txt forms.dic.2 #COMMAND sort -u -o forms.dic.3 forms.dic.2 COMMAND sort -u forms.dic.2 | comm -23 - ${CMAKE_CURRENT_SOURCE_DIR}/to_ignore.dic > forms.dic.3 - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/compound_entries.txt >> dico.xml.tmp COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat COMMAND mv dico.xml.tmp dico.xml DEPENDS ${dicoFiles} diff --git a/lima_linguisticdata/analysisDictionary/fre/convert/CMakeLists.txt b/lima_linguisticdata/analysisDictionary/fre/convert/CMakeLists.txt index f509cad62..640f77cae 100644 --- a/lima_linguisticdata/analysisDictionary/fre/convert/CMakeLists.txt +++ b/lima_linguisticdata/analysisDictionary/fre/convert/CMakeLists.txt @@ -14,9 +14,9 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") COMMAND cat ${dicoFiles} > forms.dic COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py fre forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/fre/corpus/corpus_fre.txt forms.dic.2 COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2 - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat + COMMAND python3 
${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat COMMAND mv dico.xml.tmp dico.xml DEPENDS ${dicoFiles} @@ -37,9 +37,9 @@ else() # here without clear error messages, verify that the sort command used is # not the Windows one. COMMAND sort -u -o forms.dic.3 forms.dic.2 - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat COMMAND mv dico.xml.tmp dico.xml DEPENDS ${dicoFiles} diff --git a/lima_linguisticdata/analysisDictionary/por/convert/CMakeLists.txt b/lima_linguisticdata/analysisDictionary/por/convert/CMakeLists.txt index a02f2487d..bd75fe334 100644 --- a/lima_linguisticdata/analysisDictionary/por/convert/CMakeLists.txt +++ b/lima_linguisticdata/analysisDictionary/por/convert/CMakeLists.txt @@ -14,10 +14,10 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") COMMAND cat ${dicoFiles} > forms.dic COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py por forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/por/corpus/macmorpho.conll.txt forms.dic.2 COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2 - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp + COMMAND python3 
${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat COMMAND mv dico.xml.tmp dico.xml DEPENDS ${dicoFiles} @@ -36,10 +36,10 @@ else() COMMAND cat ${dicoFiles} > forms.dic COMMAND python ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py por forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/por/corpus/macmorpho.conll.txt forms.dic.2 COMMAND sort -u -o forms.dic.3 forms.dic.2 - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat COMMAND mv dico.xml.tmp dico.xml DEPENDS ${dicoFiles} diff --git a/lima_linguisticdata/cmake/LinguisticData.cmake b/lima_linguisticdata/cmake/LinguisticData.cmake index e4a3e2132..4cda79d1b 100644 --- a/lima_linguisticdata/cmake/LinguisticData.cmake +++ b/lima_linguisticdata/cmake/LinguisticData.cmake @@ -159,12 +159,12 @@ macro(CONVERT _lang) add_custom_command( OUTPUT dico.xml COMMAND echo "" > dico.xml.tmp - 
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl dicocompletstd.txt dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py dicocompletstd.txt dico.xml.tmp COMMAND bash -c "if [ -n \"${ARGN}\" ]; then cat ${ARGN} >> dico.xml.tmp; fi" COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/addnormfield.pl ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt > dicoponctu.norm.txt - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no dicoponctu.norm.txt dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no dicoponctu.norm.txt dico.xml.tmp COMMAND echo "" >> dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat COMMAND mv dico.xml.tmp dico.xml DEPENDS dicocompletstd.txt ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt COMMENT "CONVERT ${_lang} produce XML dico" @@ -175,12 +175,12 @@ macro(CONVERT _lang) add_custom_command( OUTPUT dico.xml COMMAND echo ^ > dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl dicocompletstd.txt dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py dicocompletstd.txt dico.xml.tmp COMMAND bash -c "if [ -n \"${ARGN}\" ]; then cat ${ARGN} >> dico.xml.tmp; fi" COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/addnormfield.pl ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt > dicoponctu.norm.txt - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no dicoponctu.norm.txt dico.xml.tmp + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no dicoponctu.norm.txt dico.xml.tmp COMMAND echo ^ >> dico.xml.tmp - COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt 
../code/convjys.txt default-${_lang}.dat + COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat COMMAND mv dico.xml.tmp dico.xml DEPENDS dicocompletstd.txt ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt COMMENT "produce XML dico" diff --git a/lima_linguisticdata/rules-idiom/fre/examples/test-example.sh b/lima_linguisticdata/rules-idiom/fre/examples/test-example.sh index f4db98f33..c88200202 100755 --- a/lima_linguisticdata/rules-idiom/fre/examples/test-example.sh +++ b/lima_linguisticdata/rules-idiom/fre/examples/test-example.sh @@ -25,7 +25,9 @@ for example in $*; do if [[ -a $exampleFile ]]; then cp $exampleFile resources/idiom-examples - perl -pe 's%LinguisticProcessings/fre/idiomaticExpressions-fre.rules%idiom-examples/$ENV{"exampleFile"}%' $LIMA_CONF/lima-lp-tva-fre.xml > conf/lima-lp-tva-fre-example.xml; + python -c "import os, re; open('conf/lima-lp-tva-fre-example.xml', 'w', encoding='utf-8').write(re.sub(r'LinguisticProcessings/fre/idiomaticExpressions-fre.rules', f'idiom-examples/{os.getenv('exampleFile')}', open(os.path.join(os.getenv('LIMA_CONF'), 'lima-lp-tva-fre.xml'), 'r', encoding='utf-8').read()))" + +# perl -pe 's%LinguisticProcessings/fre/idiomaticExpressions-fre.rules%idiom-examples/$ENV{"exampleFile"}%' $LIMA_CONF/lima-lp-tva-fre.xml > conf/lima-lp-tva-fre-example.xml; echo "running test on $exampleFile"; tva --language=fre --resources-dir=resources --config-dir=conf --lp-config-file=lima-lp-tva-example.xml idiom-example-fre-test.xml >& tva-$example.log; egrep "(TestReport|total)" tva-$example.log diff --git a/lima_linguisticdata/scripts/cmakeconvertdefautjys.py b/lima_linguisticdata/scripts/cmakeconvertdefautjys.py new file mode 100644 index 000000000..229dff0d4 --- /dev/null +++ b/lima_linguisticdata/scripts/cmakeconvertdefautjys.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +# Copyright 
def _open_or_exit(stack, path, mode):
    """Open *path* as UTF-8 text, registering it on *stack* for cleanup.

    On failure, print ``Cannot open <path>`` to stderr and exit with
    status 1. Files already registered on *stack* are closed while the
    ``SystemExit`` propagates out of the enclosing ``with`` block.
    """
    try:
        return stack.enter_context(open(path, mode, encoding="utf-8"))
    except OSError:
        print(f"Cannot open {path}", file=sys.stderr)
        sys.exit(1)


def _load_tags(convert):
    """Build the conversion table from an iterable of ``key;value`` lines.

    Lines without a ``;`` separator are ignored. Only the first two
    fields of each line are used.
    """
    tags = {}
    for line in convert:
        fields = line.strip().split(";")
        if len(fields) > 1:
            tags[fields[0]] = fields[1]
    return tags


def _with_progress(lines):
    """Wrap *lines* in a tqdm progress bar when tqdm can be imported.

    The progress bar is purely cosmetic, so if the import fails the
    lines are simply iterated without one.
    """
    try:
        from tqdm import tqdm
    except ImportError:
        return lines
    return tqdm(lines, desc="Processing lines", unit="line")


def main():
    """Convert a default-categories file using a conversion table.

    Command-line arguments (read from ``sys.argv``):
        source:  tab-separated input, one ``type<TAB>info`` pair per line;
                 blank lines and lines starting with ``#`` are skipped.
        convert: ``;``-separated table mapping ``info`` to its converted
                 value.
        cible:   output file, receiving ``type<TAB>converted`` lines.

    Lines with a wrong number of columns, or whose ``info`` has no
    non-empty entry in the table, are reported on stderr and appended to
    ``error.txt`` in the current directory; processing then continues.

    Exits with status 1 if any of the four files cannot be opened.
    """
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(description="Convert dicostd into dicojys.")
    parser.add_argument("source", type=str, help="Source file")
    parser.add_argument("convert", type=str, help="Convert file")
    parser.add_argument("cible", type=str, help="Cible file")
    args = parser.parse_args()

    print("INFO : Start default categories conversion", file=sys.stderr)

    # ExitStack guarantees every successfully opened file is closed (and
    # the output flushed) on all exit paths, including early sys.exit().
    with ExitStack() as stack:
        # Keep this order: inputs are opened before the output so that a
        # missing input does not truncate an existing output file.
        source = _open_or_exit(stack, args.source, "r")
        convert = _open_or_exit(stack, args.convert, "r")
        cible = _open_or_exit(stack, args.cible, "w")
        error = _open_or_exit(stack, "error.txt", "a")

        tags = _load_tags(convert)

        def report(message):
            # The message already ends with a newline; end="" avoids
            # printing a spurious blank line after it on stderr.
            print(message, file=sys.stderr, end="")
            error.write(message)

        # The lines are read up front so the progress bar knows the total.
        numbered = enumerate(_with_progress(source.readlines()), start=1)
        for line_num, raw_line in numbered:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            donnees = line.split("\t")
            if len(donnees) != 2:
                report(
                    f"in file {args.source} line {line_num}: "
                    f"wrong number of columns. Ignore line: {line}\n"
                )
                continue

            type_, info = donnees
            converted = tags.get(info)
            if converted:
                cible.write(f"{type_}\t{converted}\n")
            else:
                report(
                    f"in file {args.source} line {line_num}: "
                    f"Invalid properties {type_} {info}\n"
                )
" 'Default is none, that equals "add"', + ) + parser.add_argument( + "-lingop", + type=str, + choices=["add", "replace", "delete"], + help="Specify op attribute for linginfos. " + 'Default is none, that equals "add"', + ) + + args = parser.parse_args() + + with ( + codecs.open(args.inputfile, "r", "utf-8") as source, + codecs.open(args.outputfile, "a", "utf-8") as out + ): + form = "" + lemma = "" + norm = "" + + count = 0 + icount = 0 + + lines = source.readlines() + for line in tqdm(lines, desc="Processing lines", unit="line"): + line = line.strip() + line = ( + line.replace("&", "&") + .replace('"', """) + .replace("<", "<") + .replace(">", ">") + ) + if line == "": + continue + + line = line.split("#")[0].strip() + data = line.split("\t") + if line == "" or len(data) != 4: + print(f"xmlform: Invalid line '{line}'") + continue + + if data[0] != form: + form = data[0] + if count > 0: + out.write(" \n\n") + out.write(f'\n") + icount = 0 + count += 1 + + if icount == 0 or data[1] != lemma or data[2] != norm: + lemma = data[1] + norm = data[2] + if icount > 0: + out.write(" \n") + out.write(" \n") + icount += 1 + + out.write(f'

\n') + + if count > 0: + out.write(" \n\n") + + +if __name__ == "__main__": + main()