diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2da781464..7c662dbfa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,9 +1,9 @@
repos:
-- repo: https://github.com/ambv/black
- rev: 23.7.0
- hooks:
- - id: black
- language_version: python3.10
+ #- repo: https://github.com/ambv/black
+ # rev: 23.7.0
+ # hooks:
+ # - id: black
+ # language_version: python3.7
- repo: https://github.com/pycqa/flake8.git
rev: 3.9.1
hooks:
diff --git a/lima_linguisticdata/analysisDictionary/eng/convert/CMakeLists.txt b/lima_linguisticdata/analysisDictionary/eng/convert/CMakeLists.txt
index 04ba6cb36..5049f9d65 100644
--- a/lima_linguisticdata/analysisDictionary/eng/convert/CMakeLists.txt
+++ b/lima_linguisticdata/analysisDictionary/eng/convert/CMakeLists.txt
@@ -17,10 +17,10 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py eng forms.dic ${CMAKE_CURRENT_BINARY_DIR}/../../../disambiguisationMatrices/eng/corpus_eng_merge.txt forms.dic.2
#COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2
COMMAND LC_ALL="C" sort -u forms.dic.2 | LC_ALL="C" comm -23 - ${CMAKE_CURRENT_SOURCE_DIR}/to_ignore.dic > forms.dic.3
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/compound_entries.txt >> dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
@@ -40,10 +40,10 @@ else()
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py eng forms.dic ${CMAKE_CURRENT_BINARY_DIR}/../../../disambiguisationMatrices/eng/corpus_eng_merge.txt forms.dic.2
#COMMAND sort -u -o forms.dic.3 forms.dic.2
COMMAND sort -u forms.dic.2 | comm -23 - ${CMAKE_CURRENT_SOURCE_DIR}/to_ignore.dic > forms.dic.3
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/compound_entries.txt >> dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-eng.txt ../code/convjys.txt default-eng.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
diff --git a/lima_linguisticdata/analysisDictionary/fre/convert/CMakeLists.txt b/lima_linguisticdata/analysisDictionary/fre/convert/CMakeLists.txt
index f509cad62..640f77cae 100644
--- a/lima_linguisticdata/analysisDictionary/fre/convert/CMakeLists.txt
+++ b/lima_linguisticdata/analysisDictionary/fre/convert/CMakeLists.txt
@@ -14,9 +14,9 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
COMMAND cat ${dicoFiles} > forms.dic
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py fre forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/fre/corpus/corpus_fre.txt forms.dic.2
COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
@@ -37,9 +37,9 @@ else()
# here without clear error messages, verify that the sort command used is
# not the Windows one.
COMMAND sort -u -o forms.dic.3 forms.dic.2
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-fre.txt ../code/convjys.txt default-fre.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
diff --git a/lima_linguisticdata/analysisDictionary/por/convert/CMakeLists.txt b/lima_linguisticdata/analysisDictionary/por/convert/CMakeLists.txt
index a02f2487d..bd75fe334 100644
--- a/lima_linguisticdata/analysisDictionary/por/convert/CMakeLists.txt
+++ b/lima_linguisticdata/analysisDictionary/por/convert/CMakeLists.txt
@@ -14,10 +14,10 @@ if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
COMMAND cat ${dicoFiles} > forms.dic
COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py por forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/por/corpus/macmorpho.conll.txt forms.dic.2
COMMAND LC_ALL="C" sort -u -o forms.dic.3 forms.dic.2
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
@@ -36,10 +36,10 @@ else()
COMMAND cat ${dicoFiles} > forms.dic
COMMAND python ${PROJECT_SOURCE_DIR}/scripts/merge-dico-poscorpus.py por forms.dic ${PROJECT_SOURCE_DIR}/disambiguisationMatrices/por/corpus/macmorpho.conll.txt forms.dic.2
COMMAND sort -u -o forms.dic.3 forms.dic.2
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl forms.dic.3 dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py forms.dic.3 dico.xml.tmp
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/../../closing_tag.txt >> dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-por.txt ../code/convjys.txt default-por.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS
${dicoFiles}
diff --git a/lima_linguisticdata/cmake/LinguisticData.cmake b/lima_linguisticdata/cmake/LinguisticData.cmake
index e4a3e2132..4cda79d1b 100644
--- a/lima_linguisticdata/cmake/LinguisticData.cmake
+++ b/lima_linguisticdata/cmake/LinguisticData.cmake
@@ -159,12 +159,12 @@ macro(CONVERT _lang)
add_custom_command(
OUTPUT dico.xml
COMMAND echo "" > dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl dicocompletstd.txt dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py dicocompletstd.txt dico.xml.tmp
COMMAND bash -c "if [ -n \"${ARGN}\" ]; then cat ${ARGN} >> dico.xml.tmp; fi"
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/addnormfield.pl ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt > dicoponctu.norm.txt
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no dicoponctu.norm.txt dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no dicoponctu.norm.txt dico.xml.tmp
COMMAND echo "" >> dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS dicocompletstd.txt ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt
COMMENT "CONVERT ${_lang} produce XML dico"
@@ -175,12 +175,12 @@ macro(CONVERT _lang)
add_custom_command(
OUTPUT dico.xml
COMMAND echo ^ > dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl dicocompletstd.txt dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py dicocompletstd.txt dico.xml.tmp
COMMAND bash -c "if [ -n \"${ARGN}\" ]; then cat ${ARGN} >> dico.xml.tmp; fi"
COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/addnormfield.pl ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt > dicoponctu.norm.txt
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/xmlforms.pl -desacc=no dicoponctu.norm.txt dico.xml.tmp
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/xmlforms.py -desacc=no dicoponctu.norm.txt dico.xml.tmp
COMMAND echo ^ >> dico.xml.tmp
- COMMAND perl ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.pl ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat
+ COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/cmakeconvertdefautjys.py ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt ../code/convjys.txt default-${_lang}.dat
COMMAND mv dico.xml.tmp dico.xml
DEPENDS dicocompletstd.txt ${CMAKE_CURRENT_SOURCE_DIR}/dicoponctu.txt ${CMAKE_CURRENT_SOURCE_DIR}/default-${_lang}.txt
COMMENT "produce XML dico"
diff --git a/lima_linguisticdata/rules-idiom/fre/examples/test-example.sh b/lima_linguisticdata/rules-idiom/fre/examples/test-example.sh
index f4db98f33..c88200202 100755
--- a/lima_linguisticdata/rules-idiom/fre/examples/test-example.sh
+++ b/lima_linguisticdata/rules-idiom/fre/examples/test-example.sh
@@ -25,7 +25,9 @@ for example in $*; do
if [[ -a $exampleFile ]]; then
cp $exampleFile resources/idiom-examples
- perl -pe 's%LinguisticProcessings/fre/idiomaticExpressions-fre.rules%idiom-examples/$ENV{"exampleFile"}%' $LIMA_CONF/lima-lp-tva-fre.xml > conf/lima-lp-tva-fre-example.xml;
+ # Copy $LIMA_CONF/lima-lp-tva-fre.xml to conf/lima-lp-tva-fre-example.xml with the idiom rules path replaced by idiom-examples/<exampleFile from the environment>; str.format is used because nesting the same quotes in an f-string is a syntax error before Python 3.12.
+ python3 -c "import os, re; open('conf/lima-lp-tva-fre-example.xml', 'w', encoding='utf-8').write(re.sub(r'LinguisticProcessings/fre/idiomaticExpressions-fre.rules', 'idiom-examples/{}'.format(os.getenv('exampleFile')), open(os.path.join(os.getenv('LIMA_CONF'), 'lima-lp-tva-fre.xml'), 'r', encoding='utf-8').read()))"
+# perl -pe 's%LinguisticProcessings/fre/idiomaticExpressions-fre.rules%idiom-examples/$ENV{"exampleFile"}%' $LIMA_CONF/lima-lp-tva-fre.xml > conf/lima-lp-tva-fre-example.xml;
echo "running test on $exampleFile";
tva --language=fre --resources-dir=resources --config-dir=conf --lp-config-file=lima-lp-tva-example.xml idiom-example-fre-test.xml >& tva-$example.log;
egrep "(TestReport|total)" tva-$example.log
diff --git a/lima_linguisticdata/scripts/cmakeconvertdefautjys.py b/lima_linguisticdata/scripts/cmakeconvertdefautjys.py
new file mode 100644
index 000000000..229dff0d4
--- /dev/null
+++ b/lima_linguisticdata/scripts/cmakeconvertdefautjys.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+# Copyright 2002-2024 CEA LIST
+# SPDX-FileCopyrightText: 2024 CEA LIST
+#
+# SPDX-License-Identifier: MIT
+
+###############################################
+# Categories conversion program :
+# converts dicostd into dicojys (data multiplicative coding)
+###############################################
+
+import argparse
+import sys
+
+from tqdm import tqdm
+
+
+def main():
+    """Convert the second column of a tab-separated file through a lookup table.
+
+    Command line: ``source convert cible``.
+
+    * ``convert`` is read as ``key;value`` lines (further ``;`` fields are
+      ignored) into a lookup table.
+    * Each ``type<TAB>key`` line of ``source`` is written to ``cible`` as
+      ``type<TAB>value``; blank lines and lines starting with ``#`` are skipped.
+    * Lines without exactly two columns, and keys with no non-empty value, are
+      reported on stderr and appended to ``error.txt`` in the current directory.
+
+    Exits with status 1 if any of the four files cannot be opened.
+    """
+    from contextlib import ExitStack
+
+    parser = argparse.ArgumentParser(description="Convert dicostd into dicojys.")
+    parser.add_argument("source", type=str, help="Source file")
+    parser.add_argument("convert", type=str, help="Convert file")
+    parser.add_argument("cible", type=str, help="Cible file")
+    args = parser.parse_args()
+
+    print("INFO : Start default categories conversion", file=sys.stderr)
+
+    def open_or_exit(stack, path, mode):
+        """Open *path* as UTF-8 under *stack*; report and exit(1) on failure."""
+        try:
+            return stack.enter_context(open(path, mode, encoding="utf-8"))
+        except OSError:
+            print(f"Cannot open {path}", file=sys.stderr)
+            sys.exit(1)
+
+    # The ExitStack closes every file opened so far, even when a later open
+    # fails or an exception escapes the conversion loop.
+    with ExitStack() as stack:
+        source = open_or_exit(stack, args.source, "r")
+        convert = open_or_exit(stack, args.convert, "r")
+        cible = open_or_exit(stack, args.cible, "w")
+        error = open_or_exit(stack, "error.txt", "a")
+
+        def report(message):
+            """Send *message* to stderr and append it to the error log."""
+            print(message, file=sys.stderr)
+            error.write(message)
+
+        # Lookup table: first ';'-separated field -> second field.
+        tags = {}
+        for line in convert:
+            donneestags = line.strip().split(";")
+            if len(donneestags) > 1:
+                tags[donneestags[0]] = donneestags[1]
+
+        # readlines() hands tqdm a list, so the progress bar knows the total.
+        progress = tqdm(source.readlines(), desc="Processing lines", unit="line")
+        for line_num, line in enumerate(progress, start=1):
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+
+            donnees = line.split("\t")
+            if len(donnees) != 2:
+                report(
+                    f"in file {args.source} line {line_num}: "
+                    f"wrong number of columns. Ignore line: {line}\n"
+                )
+                continue
+
+            type_, info = donnees
+            code = tags.get(info)
+            # An empty value is treated like a missing key.
+            if code:
+                cible.write(f"{type_}\t{code}\n")
+            else:
+                report(
+                    f"in file {args.source} line {line_num}: "
+                    f"Invalid properties {type_} {info}\n"
+                )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/lima_linguisticdata/scripts/xmlforms.py b/lima_linguisticdata/scripts/xmlforms.py
new file mode 100644
index 000000000..a49a7d4a2
--- /dev/null
+++ b/lima_linguisticdata/scripts/xmlforms.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+
+# Copyright 2002-2024 CEA LIST
+# SPDX-FileCopyrightText: 2024 CEA LIST
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+import codecs
+
+from tqdm import tqdm
+
+
+def print_usage():
+    """Print the xmlforms command-line usage summary to standard output."""
+    usage_lines = (
+        "USAGE : xmlforms [OPTIONS] inputfile outputfile",
+        "where [OPTIONS] are :",
+        " -h or -help : print usage",
+        " -desacc=[yes|no] : specify desacc attribute for entries. "
+        "default is none, that equals 'yes'",
+        " -entryop=[add|replace|delete] : specify op attribute for entries. "
+        "default is none, that equals 'add'",
+        " -lingop=[add|replace|delete] : specify op attribute for linginfos. "
+        "default is none, that equals 'add'",
+    )
+    # One print() per entry: each usage line ends with a single newline.
+    for usage_line in usage_lines:
+        print(usage_line)
+
+
+def main():
+ """Convert the tab-separated ``inputfile`` and append the result to ``outputfile``.
+
+ Command line: ``[-desacc yes|no] [-entryop add|replace|delete]
+ [-lingop add|replace|delete] inputfile outputfile``.
+
+ The input is read as UTF-8. Each non-empty line is cut at the first ``#``
+ and must then hold exactly four tab-separated fields; otherwise it is
+ reported on stdout as invalid and skipped. Consecutive lines are grouped
+ by their first field and, inside such a group, by their second and third
+ fields. The output file is opened in append mode, so existing content is
+ kept.
+
+ NOTE(review): several string literals in this body look damaged (see the
+ inline notes); what exactly is written to the output cannot be told from
+ the text as it stands.
+ """
+ parser = argparse.ArgumentParser(
+ description="Process input and output files with options."
+ )
+ parser.add_argument("inputfile", type=str, help="Input file")
+ parser.add_argument("outputfile", type=str, help="Output file")
+ parser.add_argument(
+ "-desacc",
+ type=str,
+ choices=["yes", "no"],
+ help="Specify desacc attribute for entries. "
+ 'Default is none, that equals "yes"',
+ )
+ parser.add_argument(
+ "-entryop",
+ type=str,
+ choices=["add", "replace", "delete"],
+ help="Specify op attribute for entries. " 'Default is none, that equals "add"',
+ )
+ parser.add_argument(
+ "-lingop",
+ type=str,
+ choices=["add", "replace", "delete"],
+ help="Specify op attribute for linginfos. "
+ 'Default is none, that equals "add"',
+ )
+
+ # NOTE(review): args.desacc, args.entryop and args.lingop are parsed here but
+ # are not read anywhere in the rest of this body as it appears - confirm.
+ args = parser.parse_args()
+
+ # NOTE(review): a parenthesised multi-item ``with`` is documented syntax only
+ # from Python 3.10 on - confirm the interpreter used by the build.
+ with (
+ codecs.open(args.inputfile, "r", "utf-8") as source,
+ codecs.open(args.outputfile, "a", "utf-8") as out
+ ):
+ # Values of the group currently being written.
+ form = ""
+ lemma = ""
+ norm = ""
+
+ # count: number of first-field groups started so far.
+ # icount: number of sub-groups started inside the current group.
+ count = 0
+ icount = 0
+
+ # The whole input is loaded before the loop; tqdm shows progress over it.
+ lines = source.readlines()
+ for line in tqdm(lines, desc="Processing lines", unit="line"):
+ line = line.strip()
+ # NOTE(review): as written, the first, third and fourth replace() below
+ # substitute a character with itself, and the second one passes three
+ # consecutive double quotes, which is not a valid string literal here.
+ # Presumably these were XML entity escapes whose replacement text was
+ # lost - confirm against the committed file.
+ line = (
+ line.replace("&", "&")
+ .replace('"', """)
+ .replace("<", "<")
+ .replace(">", ">")
+ )
+ if line == "":
+ continue
+
+ # Everything from the first '#' on is dropped. A line holding only a
+ # comment becomes empty here and is reported by the check below.
+ line = line.split("#")[0].strip()
+ data = line.split("\t")
+ if line == "" or len(data) != 4:
+ print(f"xmlform: Invalid line '{line}'")
+ continue
+
+ # First field changed: finish the previous group (if any), start a new
+ # one and reset the sub-group counter.
+ # NOTE(review): the second write below opens its literal with f' and
+ # closes it with a double quote, and neither literal mentions ``form`` -
+ # the original text appears to be missing.
+ if data[0] != form:
+ form = data[0]
+ if count > 0:
+ out.write(" \n\n")
+ out.write(f'\n")
+ icount = 0
+ count += 1
+
+ # First line of a group, or second/third field changed: finish the
+ # previous sub-group (if any) and start a new one.
+ if icount == 0 or data[1] != lemma or data[2] != norm:
+ lemma = data[1]
+ norm = data[2]
+ if icount > 0:
+ out.write(" \n")
+ out.write(" \n")
+ icount += 1
+
+ # One write per accepted input line.
+ # NOTE(review): this f-string has no placeholder and data[3] is never
+ # used in the visible text - confirm what should be written here.
+ out.write(f' \n')
+
+ # Finish the last group when at least one was started.
+ if count > 0:
+ out.write(" \n\n")
+
+
+if __name__ == "__main__":
+ main()