Skip to content

Commit

Permalink
Merge pull request #1337 from ampli/amy
Browse files Browse the repository at this point in the history
More amy/anysplit modifications
  • Loading branch information
linas authored Aug 13, 2022
2 parents 05c95a9 + 56247cb commit 1744562
Show file tree
Hide file tree
Showing 5 changed files with 314 additions and 151 deletions.
16 changes: 10 additions & 6 deletions data/amy/4.0.affix
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@

% Anysplit parameters

% A PCRE2 regex defining a character sequence that shouldn't get split.
% The LG library must be configured with PCRE2 in order to use it. If not,
% or if this definition is missing, a single utf8 codepoint is used as a
% byte sequence that should not get split.
%#define atomic-unit "\X"; % split at grapheme boundaries.
#define atomic-unit "\X\pM*"; % ... but include trailing mark codepoints.

% Maximum number of word partitions
% FYI: 3 barely works, 4 and higher mostly do not work.
% 6: REGPARTS+;
Expand Down Expand Up @@ -47,16 +54,13 @@
% For ASCII input, the empty regexes can be used.
% See the comments in 4.0.affix.

%"" : REGPRE+;
"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGPRE+;
"" : REGPRE+;

% Regex to match the middle parts.
%"" : REGMID+;
"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGMID+;
"" : REGMID+;
%".{2,}": REGMID+;

% Regex to match the suffix.
%"" : REGSUF+;
"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGSUF+;
"" : REGSUF+;

% End of Anysplit parameters.
56 changes: 27 additions & 29 deletions data/amy/4.0.regex
Original file line number Diff line number Diff line change
Expand Up @@ -8,38 +8,36 @@
% The regexes here use the PCRE2 pattern syntax.
% The LG library must be configured with PCRE2 in order to use them.

% \X matches any Unicode grapheme.
% (?:(?=\p{Xan}) specifies that it should start with a letter or number.
% Similarly, \pM allows it to start with a mark character.
% Since most of the script-specific punctuation characters are not in
% the affix-punc file, they are allowed here to join to the end word/parts
% Most probably these regexes still reject valid word graphemes in some languages.
% \X matches any Unicode grapheme. \x03 matches the internal representation
% of the dot in STEMSUBSCR (See 4.0.affix).
%
% For information on graphemes see: http://www.unicode.org/reports/tr29/

% Want to match apostrophes, for abbreviations (I'm I've, etc.) since
% these cannot be auto-split with the current splitter.
% Hyphenated words, and words with underbars in them, get split.
ANY-WORD: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$/
ANY-PUNCT: /^[[:punct:]]+$/

% Multi-part random morphology: match any string as prefix, stem, or
% suffix.
% \x03 matches the internal representation of the dot in STEMSUBSCR
% (See 4.0.affix).

MOR-STEM: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*\x03=$/
MOR-PREF: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*=$/
MOR-SUFF: /^=(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*/

% For ASCII input, the following is enough (and it works even if the
% LG library is configured with a regex library other than PCRE2).
% To use it, uncomment it and comment out the previous definitions.
% ANY-WORD: /^[[:alnum:]']+$/
% ANY-PUNCT: /^[[:punct:]]+$/
% MOR-PREF: /^[[:alnum:]']+=$/
% MOR-STEM: /^[[:alnum:]']+.=$/
% MOR-SUFF: /^=[[:alnum:]']+$/
% Punctuation characters get stripped from the start and end of words,
% and words that contain punctuation get split at them. See the
% "any/affix-punc" file.
% These punctuation characters will match here. The \x03 is to match
% subscripted punctuation that may be specified in this file.
ANY-PUNCT: /^[[:punct:]]+(:?\x03|$)/

% Multi-part random morphology: match any string as prefix, stem, or suffix.

MOR-STEM: /^\X+\x03=$/
MOR-PREF: /^\X+=$/
MOR-SUFF: /^=\X+/

% Reject anything that contains punctuation, so that the tokenizer will
% have a chance to split them off as affixes.
% Most of the script-dependent punctuation characters are not mentioned in
% the "any/affix-punc" file and thus may be included in words.
ANY-WORD: /^[^[:punct:]]+$/

% For ASCII input and non-PCRE2 regex libraries you can use these instead:
% ANY-WORD: /^[[:alnum:]]+$/
% ANY-PUNCT: /^[[:punct:]].*$/ % The .* is to match an optional subscript.
% MOR-PREF: /^[[:alnum:]]+=$/
% MOR-STEM: /^[[:alnum:]]+.=$/
% MOR-SUFF: /^=[[:alnum:]]+$/

% Match anything that doesn't match the above.
% Match anything that isn't white-space.
Expand Down
26 changes: 13 additions & 13 deletions data/any/affix-punc
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』」 """ "’’" "’" ''.y '.y
"%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ?! ….y ....y "”"
_ - ‐ ‑ ‒ – — ― ~ ━ ー 、
¢ ₵ ™ ℠ : RPUNC+;

"(" "{" "[" "<" « 〈 ( 〔 《 【 [ 『 「 """ `` „ “ ‘ ''.x '.x ….x ....x
¿ ¡ "$"
_ - ‐ ‑ ‒ – — ― ━ ー ~
£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점
† †† ‡ § ¶ © ® ℗ № "#": LPUNC+;

-- ‒ – — ― - _ "(" ")" "[" "]" ... … "," ";" ":"
': MPUNC+;
% Affixes get stripped off the left and right side of words
% i.e. spaces are inserted between the affix and the word itself.

% An LPUNC/RPUNC/MPUNC token can be specified as "/regex/.\N", where \N is
% the capture group that should match the affix (the whole pattern is
% capture group 0). Regardless of the position in which they appear, they
% are checked last - but in the same order. (Experimental.)

"’’" ''.y ….y ....y "/[[:punct:]]$/.\0": RPUNC+;

`` ''.x ….x ....x †† "/^[[:punct:]]/.\0": LPUNC+;

-- ... … "/[[:punct:]]/.\0" ': MPUNC+;
2 changes: 1 addition & 1 deletion link-grammar/dict-common/dict-impl.c
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ bool afdict_init(Dictionary dict)

for (int n = 0; n < ac->length - ac->Nregexes; n++)
{
if (!dict_has_word(dict, ac->string[n]))
if (!dictionary_word_is_known(dict, ac->string[n]))
{
if (!not_in_dict)
{
Expand Down
Loading

0 comments on commit 1744562

Please sign in to comment.