Skip to content

Commit

Permalink
Kalmyk parser implementation -- ispras/lingvodoc-react#1119 (#1503)
Browse files Browse the repository at this point in the history
* use postgres:ready if is

* fix

* image: 'docker_api:latest'

* fix

* dirty fix

* docker.ini.template

* minor

* PGDATA

* added hfst parser

* added hfst parser

* pretty output

* correct output

* lib

* file dynamical name

* fix

* fix

* More correct translation

* Fix

* Revert "Fix"

This reverts commit 6150886.

* Fix

* /opt/hfst

* moved

* fixed requirement

* docker-compose-proxy.yml

* Cleanup

* Catch when no gloss

* Indicate markups with 'gr: Unknown' in any proposed variants with yellow color on OdtMarkupModal/PropertiesView

* fix
  • Loading branch information
vmonakhov authored Apr 14, 2024
1 parent c1676cd commit ae80aa4
Show file tree
Hide file tree
Showing 11 changed files with 26,394 additions and 13 deletions.
27 changes: 27 additions & 0 deletions alembic/versions/0fc45203d6ab_kalmyk_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Kalmyk parser added
Revision ID: 0fc45203d6ab
Revises: 6e02e6fdf0f9
Create Date: 2024-01-11 12:20:38.119574
"""

# revision identifiers, used by Alembic.
revision = '0fc45203d6ab'
down_revision = '6e02e6fdf0f9'
branch_labels = None
depends_on = None

from alembic import op

def upgrade():
    # Register the Kalmyk hfst parser so it becomes selectable for corpora.
    # 'hfst_kalmyk' must match the method name defined in
    # lingvodoc/utils/doc_parser.py; (object_id, client_id) = (12, 1) is
    # assumed not to collide with existing parser rows — TODO confirm.
    op.execute('''
    INSERT INTO public.parser(additional_metadata, created_at, object_id, client_id, name, parameters, method)
    VALUES(null, '2024-01-11 12:20:38', 12, 1, 'Парсер калмыцкого языка (hfst)', '[]',
    'hfst_kalmyk');
    ''')

def downgrade():
    # Remove every parser row registered with the 'hfst_kalmyk' method,
    # undoing upgrade() above.
    op.execute('''
    DELETE FROM parser WHERE method = 'hfst_kalmyk';
    ''')
47 changes: 47 additions & 0 deletions aux_scripts/compile_xfst.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/python3
"""CGI endpoint for compiling an hfst transducer.

POST with two file fields (LEXC: lexc lexicon, RULES: xfst rules) buffers
the uploads under TMP, appends a 'save stack' directive to the rules so
the compiled transducer is persisted, copies both files to DIR and
compiles the rules with hfst. A request without both files renders the
upload form instead.
"""
from hfst_dev import compile_xfst_file
# NOTE(review): the cgi/cgitb modules are deprecated since Python 3.11 and
# removed in 3.13 — plan a migration off them.
import cgi
import cgitb
import shutil

STACK_FILENAME = 'rules.xfst.hfst'
DIR = '/var/www/cgi-bin/xal/'
TMP = '/var/www/tmp/'

cgitb.enable(format='text')
POST = cgi.FieldStorage()

print('Content-Type: text/html; charset=utf-8')
print('')

try:
    LF = POST['LEXC'].filename
    RF = POST['RULES'].filename
except KeyError:  # was a bare `except:`; only a missing form field is expected
    LF = ''
    RF = ''

# `filename` is None for non-file fields, '' when absent — truthiness covers both.
if LF and RF:
    lexc_tmp = TMP + LF
    rules_tmp = TMP + RF
    # Buffer the uploads first; `with` guarantees the handles are flushed/closed
    # (the original leaked descriptors via open/close without try/finally).
    with open(lexc_tmp, 'wb') as lexc_file:
        lexc_file.write(POST['LEXC'].file.read())
    with open(rules_tmp, 'wb') as rules_file:
        rules_file.write(POST['RULES'].file.read())
    # Persist the compiled stack: xfst executes this directive at the end.
    with open(rules_tmp, 'a+') as rules_file:
        rules_file.write('\nsave stack ' + STACK_FILENAME)
    shutil.copyfile(lexc_tmp, DIR + LF)
    shutil.copyfile(rules_tmp, DIR + RF)
    compile_xfst_file(DIR + RF)
    print('XFST compiled!')
else:
    print('''
<form method="post" enctype="multipart/form-data">
<input type="file" name="LEXC"><br/>
<input type="file" name="RULES"><br/>
<input type="submit" value="COMPILE!">
</form>
''')
2 changes: 2 additions & 0 deletions docker/docker-compose-proxy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ services:
volumes:
- ./frontend/dist:/dist
- /opt/apertium:/opt/apertium
- /opt/hfst:/opt/hfst
- ./sock:/sock
- /api/build/
- ../:/api
Expand All @@ -102,6 +103,7 @@ services:
volumes:
- ./frontend-proxy/dist:/dist
- /opt/apertium:/opt/apertium
- /opt/hfst:/opt/hfst
- ./sock-proxy:/sock
- /api/build/
- ../:/api
Expand Down
1 change: 1 addition & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ services:
volumes:
- ./frontend/dist:/dist
- /opt/apertium:/opt/apertium
- /opt/hfst:/opt/hfst
- ./sock:/sock
- /api/build/
- ../:/api
Expand Down
6 changes: 3 additions & 3 deletions lingvodoc/schema/gql_parserresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,10 @@ def mutate(root, info, **args):

def get_parser_result_for_text(text, parse_method, apertium_path):
    """Run the ParseMethods method named *parse_method* on *text*.

    Only apertium-based methods need the apertium data path; every other
    method family (timarkh, hfst, ...) takes the bare text, so unknown
    future methods default to the single-argument call.
    """
    method = getattr(ParseMethods, parse_method)
    if "apertium" in parse_method:  # idiomatic membership test, was .find() != -1
        result = method(text, apertium_path)
    else:
        result = method(text)
    return result


Expand Down
25,882 changes: 25,882 additions & 0 deletions lingvodoc/static/parsers/hfst/xal/lexicon.lexc

Large diffs are not rendered by default.

351 changes: 351 additions & 0 deletions lingvodoc/static/parsers/hfst/xal/rules.xfst

Large diffs are not rendered by default.

Binary file not shown.
7 changes: 3 additions & 4 deletions lingvodoc/utils/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,11 +552,10 @@ def create_parser_result(
r = requests.post(url=dedoc_url, files=files, data=data)
dedoc_output = re.sub(r"(<sub>.*?</sub>)", "", r.content.decode('utf-8'))

if parser.method.find("timarkh") != -1:
result = parse_method(dedoc_output, **arguments)

elif parser.method.find("apertium") != -1:
if parser.method.find("apertium") != -1:
result = parse_method(dedoc_output, apertium_path, **arguments)
else:
result = parse_method(dedoc_output, **arguments)

dbparserresult = ParserResult(client_id=client_id, object_id=object_id,
parser_object_id=parser_object_id, parser_client_id=parser_client_id,
Expand Down
83 changes: 77 additions & 6 deletions lingvodoc/utils/doc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from uniparser_moksha import MokshaAnalyzer
from uniparser_komi_zyrian import KomiZyrianAnalyzer
from nltk.tokenize import RegexpTokenizer
from hfst_dev import HfstTransducer
from lxml.html import fromstring
import csv
import os
import tempfile
Expand All @@ -24,7 +26,7 @@ def print_to_str(*args, **kwargs):


span_id_counter = 0
def generate_html_wrap(word, ana_tag_list, lang=""):
def generate_html_wrap(word, ana_tag_list, lang="", extra_state=""):

json_list = list()
for ana_tag in ana_tag_list:
Expand All @@ -40,16 +42,16 @@ def generate_html_wrap(word, ana_tag_list, lang=""):

global span_id_counter
span_id_counter += 1
wrap = "<span class=\"unverified\"" + " id=" + str(span_id_counter) + ">"
wrap = f"<span class=\"unverified {extra_state}\"" + " id=" + str(span_id_counter) + ">"
for attr_json in json_list:
span_id_counter += 1
encoded_attrs = ((json.dumps(attr_json, ensure_ascii=False)).encode('utf8')).decode()
wrap += "<span class=\"result\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs + "</span>"
wrap += f"<span class=\"result {extra_state}\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs + "</span>"

if lang == 'udm' and 'nom' in encoded_attrs:
flag = True
span_id_counter += 1
wrap += "<span class=\"result\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs.replace('nom', 'acc0') + "</span>"
wrap += f"<span class=\"result {extra_state}\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs.replace('nom', 'acc0') + "</span>"

wrap += word + "</span>"
return wrap
Expand All @@ -68,8 +70,9 @@ def insert_parser_output_to_text(text, parser_output, lang=""):
if text[match_index-len(ESC_PAT):match_index] == ESC_PAT and text[match_index+len(word):match_index+len(word)+len(ESC_PAT)] == ESC_PAT:
continue
result_list.append(text[search_start_index:match_index])
if (len(w_tag.contents) > 1):
result_list.append(generate_html_wrap(word, w_tag.contents[0:-1], lang=lang))
if len(w_tag.contents) > 1:
extra_state = "broken" if any([a.get('gr') == "Unknown" for a in w_tag.find_all('ana')]) else ""
result_list.append(generate_html_wrap(word, w_tag.contents[0:-1], lang=lang, extra_state=extra_state))
search_start_index = match_index + len(word)
result_list.append(text[search_start_index:])
result = "".join(result_list)
Expand Down Expand Up @@ -358,6 +361,72 @@ def trans(elem):

return insert_parser_output_to_text(dedoc_output, parser_output, lang=lang)

def hfst_parser(dedoc_output, lang, debug_flag=False):
    """Analyze dedoc HTML output with an hfst transducer for *lang*.

    Loads /opt/hfst/<lang>/rules.xfst.hfst (inverted, so surface forms can
    be looked up) plus the lexc lexicon, whose '!' comments supply Russian
    translations. Each word is wrapped in <w>/<ana> markup; words without
    a gloss get gr="Unknown" so the UI can highlight them. The markup is
    then spliced back into the original HTML via
    insert_parser_output_to_text.
    """
    if debug_flag:
        with open("dedoc_output", 'w') as f:
            print(dedoc_output, file=f)

    parser_path = f"/opt/hfst/{lang}"

    with open(f"{parser_path}/lexicon.lexc", 'r') as f:
        lexicon = f.read()

    xfst = HfstTransducer.read_from_file(f"{parser_path}/rules.xfst.hfst")
    # The compiled stack maps analysis -> surface; invert to look up words.
    xfst.invert()

    # Inside [...] every character is a literal, so the original
    # '[.|!|?|...]' and '[,| |:|"|-|*]' also split on '|' by accident;
    # these classes keep only the intended separators ('-' last = literal).
    sent_regex = re.compile(r'[.!?]')
    word_regex = re.compile(r'[, :"*-]')

    words = 0
    analyzed = 0
    parser_list = []

    # Strip html tags from dedoc_output: the transducer expects plain text.
    text = fromstring(dedoc_output).text_content()
    sentences = filter(None, (t.strip() for t in sent_regex.split(text)))
    for s in sentences:
        wordlist = filter(None, (t.strip() for t in word_regex.split(s)))
        for w in wordlist:
            words += 1
            lookup = xfst.lookup(w)
            if not lookup:
                # Retry lowercased, e.g. for sentence-initial capitals.
                lookup = xfst.lookup(w.lower())
            if lookup:
                analyzed += 1
                section = "'<w>"
                for lkp in (pair[0] for pair in lookup):

                    if '+' in lkp:
                        plus_pos = lkp.index('+')
                        lex = lkp[:plus_pos]
                        gr = lkp[plus_pos + 1:].replace('+', ',')
                    else:
                        lex = lkp
                        gr = "Unknown"

                    # Get translation from the lexicon's '!' comment; lex/w
                    # may contain regex metacharacters, so escape them.
                    lex_re = re.escape(lex)
                    if ((xln := re.search(f"[\r\n]{lex_re}:{re.escape(w)} .*!([^0].*)[\r\n]", lexicon)) or
                            (xln := re.search(f"[\r\n]{lex_re}:{re.escape(w.lower())} .*!([^0].*)[\r\n]", lexicon)) or
                            (xln := re.search(f"[\r\n]{lex_re}:.*!([^0].*)[\r\n]", lexicon))):
                        xln = xln.group(1)
                    else:
                        xln = "Unknown"

                    section += f'<ana lex={lex} gr={gr} parts="" gloss="" trans_ru={xln}></ana>'
                section += f"{w}</w>'"
                parser_list.append(section)
            else:
                parser_list.append(f'\'<w><ana lex="" gr="" parts="" gloss=""></ana>{w}</w>\'')

    parser_output = ", ".join(parser_list)

    if debug_flag:
        with open("parser_output", 'w') as f:
            print(parser_output, file=f)
        if words:  # guard ZeroDivisionError on input with no words
            print(f"Analyzed per word: {analyzed / words}")

    return insert_parser_output_to_text(dedoc_output, parser_output, lang=lang)

def timarkh_udm(dedoc_output):
    # Udmurt: thin wrapper binding timarkh_uniparser to the 'udm' analyzer.
    return timarkh_uniparser(dedoc_output, 'udm')
Expand Down Expand Up @@ -392,3 +461,5 @@ def apertium_bak(dedoc_output, apertium_path):
def apertium_rus(dedoc_output, apertium_path):
    # Russian: apertium-based method, so it also receives the apertium data path.
    return apertium_parser(dedoc_output, apertium_path, 'rus')

def hfst_kalmyk(dedoc_output):
    # Kalmyk ('xal'): hfst-based method; registered in the parser table by
    # migration 0fc45203d6ab under method name 'hfst_kalmyk'.
    return hfst_parser(dedoc_output, 'xal')
1 change: 1 addition & 0 deletions server-requirements-1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ graphene==2.0.1
graphql-core==2.0
graphql-relay==0.4.5
gunicorn==19.7.1
hfst_dev==3.15.0.10b0
imagesize==1.1.0
iso8601==0.1.11
jdcal==1.4.1
Expand Down

0 comments on commit ae80aa4

Please sign in to comment.