Merge pull request #1337 from ampli/amy

More amy/anysplit modifications
opencog · Aug 13, 2022 · 1744562 · 1744562
2 parents 05c95a9 + 56247cb
commit 1744562
Show file tree

Hide file tree

Showing 5 changed files with 314 additions and 151 deletions.
diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix
@@ -17,6 +17,13 @@
 
 % Anysplit parameters
 
+% A PCRE2 regex defining a character sequence that shouldn't get split.
+% The LG library must be configured with PCRE2 in order to use it. If not,
+% or if this definition is missing, a single UTF-8 codepoint is used as the
+% byte sequence that should not get split.
+%#define atomic-unit "\X";       % split at grapheme boundaries.
+#define atomic-unit "\X\pM*";   % ... but include trailing mark codepoints.
+
 % Maximum number of word partitions
 % FYI: 3 barely works, 4 and higher mostly do not work.
 % 6: REGPARTS+;
@@ -47,16 +54,13 @@
 % For ASCII input, the empty regexes can be used.
 % See the comments in 4.0.affix.
 
-%"" : REGPRE+;
-"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGPRE+;
+"" : REGPRE+;
 
 % Regex to match the middle parts.
-%"" : REGMID+;
-"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGMID+;
+"" : REGMID+;
 %".{2,}": REGMID+;
 
 % Regex to match the suffix.
-%"" : REGSUF+;
-"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGSUF+;
+"" : REGSUF+;
 
 % End of Anysplit parameters.
diff --git a/data/amy/4.0.regex b/data/amy/4.0.regex
@@ -8,38 +8,36 @@
 % The regexes here use the PCRE2 pattern syntax.
 % The LG library must be configured with PCRE2 in order to use them.
 
-% \X matches any Unicode grapheme.
-% (?:(?=\p{Xan}) specifies that it should start with a letter or number.
-% Similarly, \pM allows it to start with a mark character.
-% Since most of the script-specific punctuation characters are not in
-% the affix-punc file, they are allowed here to join to the end word/parts
-% Most probably these regexes still reject valid word graphemes in some languages.
+% \X matches any Unicode grapheme. \x03  matches the internal representation
+% of the dot in STEMSUBSCR (See 4.0.affix).
 %
 % For information on graphemes see: http://www.unicode.org/reports/tr29/
 
-% Want to match apostrophes, for abbreviations (I'm I've, etc.) since
-% these cannot be auto-split with the current splitter.
-% Hyphenated words, and words with underbars in them, get split.
-ANY-WORD:  /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$/
-ANY-PUNCT: /^[[:punct:]]+$/
-
-% Multi-part random morphology: match any string as prefix, stem, or
-% suffix.
-% \x03 matches the internal representation of the dot in STEMSUBSCR
-% (See 4.0.affix).
-
-MOR-STEM: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*\x03=$/
-MOR-PREF: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*=$/
-MOR-SUFF: /^=(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*/
-
-% For ASCII input, the following is enough (and it works even if the
-% LG library is configured with a regex library other then PCRE2).
-% To use it, uncomment it out and comment out the previous definitions.
-% ANY-WORD: /^[[:alnum:]']+$/
-% ANY-PUNCT: /^[[:punct:]]+$/
-% MOR-PREF: /^[[:alnum:]']+=$/
-% MOR-STEM: /^[[:alnum:]']+.=$/
-% MOR-SUFF: /^=[[:alnum:]']+$/
+% Punctuation characters are stripped from the start and end of words,
+% and words that contain punctuation are split at them.  See the
+% "any/affix-punc" file.
+% These punctuation characters will match here. The \x03 is to match
+% subscripted punctuation that may be specified in this file.
+ANY-PUNCT: /^[[:punct:]]+(:?\x03|$)/
+
+% Multi-part random morphology: match any string as prefix, stem, or suffix.
+
+MOR-STEM: /^\X+\x03=$/
+MOR-PREF: /^\X+=$/
+MOR-SUFF: /^=\X+/
+
+% Reject anything that contains punctuation, so that the tokenizer will
+% have a chance to split them off as affixes.
+% Most of the script-dependent punctuation characters are not mentioned in
+% the "any/affix-punc" file and thus may be included in words.
+ANY-WORD: /^[^[:punct:]]+$/
+
+% For ASCII input and non-PCRE2 regex libraries you can use these instead:
+% ANY-WORD: /^[[:alnum:]]+$/
+% ANY-PUNCT: /^[[:punct:]].*$/  % The .* is to match an optional subscript.
+% MOR-PREF: /^[[:alnum:]]+=$/
+% MOR-STEM: /^[[:alnum:]]+.=$/
+% MOR-SUFF: /^=[[:alnum:]]+$/
 
 % Match anything that doesn't match the above.
 % Match anything that isn't white-space.

diff --git a/data/any/affix-punc b/data/any/affix-punc
@@ -1,13 +1,13 @@
-")" "}" "]" ">" » 〉 ） 〕 》 】 ］ 』」 """ "’’" "’" ''.y '.y
-"%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ？！ ….y ....y "”"
-_ - ‐ ‑ ‒ – — ― ～ ━ ー 、
-¢ ₵ ™ ℠ : RPUNC+;
-
-"(" "{" "[" "<" « 〈 （ 〔 《 【 ［ 『 「 """  `` „ “ ‘ ''.x '.x ….x ....x
-¿ ¡ "$"
-_ - ‐ ‑ ‒ – — ― ━ ー ～
-£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺  ℳ  ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점
-† †† ‡ § ¶ © ® ℗ № "#": LPUNC+;
-
--- ‒ – — ― - _ "(" ")" "[" "]" ... … "," ";" ":"
-': MPUNC+;
+% Affixes get stripped off the left and right side of words
+% i.e. spaces are inserted between the affix and the word itself.
+
+% An LPUNC/RPUNC/MPUNC token can be specified as "/regex/.\N", where \N is
+% the capture group that should match the affix (the whole pattern is
+% capture group 0). Regardless of the position in which they appear, such
+% regex tokens are checked last - but in the same order. (Experimental.)
+
+"’’" ''.y ….y ....y "/[[:punct:]]$/.\0": RPUNC+;
+
+`` ''.x ….x ....x †† "/^[[:punct:]]/.\0": LPUNC+;
+
+-- ... … "/[[:punct:]]/.\0" ': MPUNC+;
diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c
@@ -805,7 +805,7 @@ bool afdict_init(Dictionary dict)
 
 				for (int n = 0;  n < ac->length - ac->Nregexes; n++)
 				{
-					if (!dict_has_word(dict, ac->string[n]))
+					if (!dictionary_word_is_known(dict, ac->string[n]))
 					{
 						if (!not_in_dict)
 						{