From 922586096840bd0054463f8453e11d36fc9c957c Mon Sep 17 00:00:00 2001
From: Irina
Date: Wed, 29 Sep 2021 18:58:06 +1000
Subject: [PATCH 1/3] add onesegment property

---
 tokenizer/lib/spacy/processor.py | 5 +++--
 tokenizer/schemas.py             | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tokenizer/lib/spacy/processor.py b/tokenizer/lib/spacy/processor.py
index ff59151..6e87b93 100644
--- a/tokenizer/lib/spacy/processor.py
+++ b/tokenizer/lib/spacy/processor.py
@@ -169,8 +169,9 @@ def _segmentize(
         for token in doc:
             if ( not token.is_space and not token._.is_meta ):
                 if (
-                    (segmentOn == 'singleline' and token._.line_break_before)
-                    or (segmentOn == 'doubleline' and token._.segment_break_before)
+                    (segmentOn != 'onesegment') and
+                    ((segmentOn == 'singleline' and token._.line_break_before)
+                    or (segmentOn == 'doubleline' and token._.segment_break_before))
                 ):
                     segmentIndex = segmentIndex + 1
                     segmeta, null = self.metaParser.parseLine(
diff --git a/tokenizer/schemas.py b/tokenizer/schemas.py
index 0e0c175..04511af 100644
--- a/tokenizer/schemas.py
+++ b/tokenizer/schemas.py
@@ -38,7 +38,7 @@ class TokenizeTextRequestSchema(Schema):
         required=False,
         missing="doubleline",
         description=gettext("Identify how segments are separated in the text."),
-        validate=validate.OneOf(["singleline","doubleline"])
+        validate=validate.OneOf(["singleline","doubleline", "onesegment"])
     )
     lang = fields.Str(
         required=True,

From 007274672124f0a7a6d690d5c7e7945091711390 Mon Sep 17 00:00:00 2001
From: Irina
Date: Wed, 29 Sep 2021 20:39:06 +1000
Subject: [PATCH 2/3] set onesegment as default

---
 tokenizer/schemas.py   | 2 +-
 tokenizer/tokenizer.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tokenizer/schemas.py b/tokenizer/schemas.py
index 04511af..80b8384 100644
--- a/tokenizer/schemas.py
+++ b/tokenizer/schemas.py
@@ -36,7 +36,7 @@ class TokenizeTeiRequestSchema(Schema):
 class TokenizeTextRequestSchema(Schema):
     segments = fields.Str(
         required=False,
-        missing="doubleline",
+        missing="onesegment",
         description=gettext("Identify how segments are separated in the text."),
         validate=validate.OneOf(["singleline","doubleline", "onesegment"])
     )
diff --git a/tokenizer/tokenizer.py b/tokenizer/tokenizer.py
index 06f3d08..3b88a87 100644
--- a/tokenizer/tokenizer.py
+++ b/tokenizer/tokenizer.py
@@ -13,6 +13,7 @@ import json
 
 app = Flask("tokenizer")
 
+
 ma = Marshmallow(app)
 babel = Babel(app)
 

From 17019e5edb43afa2a6b46351eb7cc29ba2a8910e Mon Sep 17 00:00:00 2001
From: Irina
Date: Wed, 29 Sep 2021 21:27:38 +1000
Subject: [PATCH 3/3] update some dependencies

---
 setup.py               | 7 ++++---
 tokenizer/tokenizer.py | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index f34dfb4..69f4f7e 100644
--- a/setup.py
+++ b/setup.py
@@ -14,10 +14,10 @@
         "apispec<4.0.0",
         "apispec-webframeworks",
         "click<=7.2.0",
-        "flask==1.1.2",
+        "flask==1.1.4",
         "Flask-Babel",
         "Flask-Cache==0.13.1",
-        "flask-cors==2.0.0",
+        "flask-cors==3.0.10",
         "flask-marshmallow",
         "gunicorn",
         "jieba==0.42.1",
@@ -32,7 +32,8 @@
         "pymorphy2-dicts-uk",
         "pymorphy2",
         "pythainlp",
-        "pyvi"
+        "pyvi",
+        "Jinja2==2.11.3"
     ],
     tests_require=[
     ],
diff --git a/tokenizer/tokenizer.py b/tokenizer/tokenizer.py
index 3b88a87..5125cca 100644
--- a/tokenizer/tokenizer.py
+++ b/tokenizer/tokenizer.py
@@ -13,7 +13,7 @@ import json
 
 app = Flask("tokenizer")
 
-
+# CORS(app)
 ma = Marshmallow(app)
 babel = Babel(app)
 
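
Reviewer note, not part of the patches: a minimal standalone sketch of the segment-break guard introduced in PATCH 1/3, using a hypothetical helper name (break_before) for illustration; the real check lives inline in _segmentize in tokenizer/lib/spacy/processor.py. With segments == 'onesegment' the guard short-circuits to False, so segmentIndex is never incremented and the whole input stays in a single segment, while 'singleline' and 'doubleline' keep their previous behavior.

# Hypothetical helper mirroring the condition added in PATCH 1/3.
def break_before(segmentOn, line_break_before, segment_break_before):
    return (
        (segmentOn != 'onesegment') and
        ((segmentOn == 'singleline' and line_break_before)
         or (segmentOn == 'doubleline' and segment_break_before))
    )

assert break_before('onesegment', True, True) is False   # never starts a new segment
assert break_before('singleline', True, False) is True   # splits on every line break
assert break_before('doubleline', False, True) is True   # splits on blank-line breaks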