From 5825dd223121ec0a0c83d297124b8c93741dddf5 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 8 Jul 2022 19:49:20 +0300 Subject: [PATCH 01/23] amy/4.0.affix,amy/4.0.regex: Simplify the regexes This doesn't work yet for splitting on grapheme boundaries, because ^X matches at least one codepoint so it matches a split initial morpheme in a part. This change is needed for the upcoming new code to split at grapheme boundaries. --- data/amy/4.0.affix | 6 +++--- data/amy/4.0.regex | 38 +++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix index eb773751d0..b6cd8b06e5 100644 --- a/data/amy/4.0.affix +++ b/data/amy/4.0.affix @@ -48,15 +48,15 @@ % See the comments in 4.0.affix. %"" : REGPRE+; -"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGPRE+; +"^\X+$" : REGPRE+; % Regex to match the middle parts. %"" : REGMID+; -"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGMID+; +"^\X+$" : REGMID+; %".{2,}": REGMID+; % Regex to match the suffix. %"" : REGSUF+; -"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGSUF+; +"^\X+$" : REGSUF+; % End of Anysplit parameters. diff --git a/data/amy/4.0.regex b/data/amy/4.0.regex index 3129716434..db822164bc 100644 --- a/data/amy/4.0.regex +++ b/data/amy/4.0.regex @@ -9,37 +9,37 @@ % The LG library must be configured with PCRE2 in order to use them. % \X matches any Unicode grapheme. -% (?:(?=\p{Xan}) specifies that it should start with a letter or number. -% Similarly, \pM allows it to start with a mark character. % Since most of the script-specific punctuation characters are not in % the affix-punc file, they are allowed here to join to the end word/parts -% Most probably these regexes still reject valid word graphemes in some languages. % % For information on graphemes see: http://www.unicode.org/reports/tr29/ -% Want to match apostrophes, for abbreviations (I'm I've, etc.) since -% these cannot be auto-split with the current splitter. 
-% Hyphenated words, and words with underbars in them, get split. -ANY-WORD: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$/ +% Hyphenated words, contractions, and words with underbars in them, get split, +% along with many other punctuation characters that get strip from start +% and end of words. See the "any/affix-punc" file. These punctuation +% characters will match here. ANY-PUNCT: /^[[:punct:]]+$/ -% Multi-part random morphology: match any string as prefix, stem, or -% suffix. +% Multi-part random morphology: match any string as prefix, stem, or suffix. % \x03 matches the internal representation of the dot in STEMSUBSCR % (See 4.0.affix). -MOR-STEM: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*\x03=$/ -MOR-PREF: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*=$/ -MOR-SUFF: /^=(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*/ +MOR-STEM: /^\X+\x03=$/ +MOR-PREF: /^\X+=$/ +MOR-SUFF: /^=\X+/ -% For ASCII input, the following is enough (and it works even if the -% LG library is configured with a regex library other then PCRE2). -% To use it, uncomment it out and comment out the previous definitions. -% ANY-WORD: /^[[:alnum:]']+$/ +% Reject anything that contains punctuation, so that the tokenizer will +% have a chance to split them off as affixes. +% Most of the script-dependent punctuation characters are not mentioned in +% the "any/affix-punc" file and thus may be included in words. +ANY-WORD: /^[^[:punct:]]+$/ + +% For ASCII input and non-PCRE2 regex libraries you can use these instead: +% ANY-WORD: /^[[:alnum:]]+$/ % ANY-PUNCT: /^[[:punct:]]+$/ -% MOR-PREF: /^[[:alnum:]']+=$/ -% MOR-STEM: /^[[:alnum:]']+.=$/ -% MOR-SUFF: /^=[[:alnum:]']+$/ +% MOR-PREF: /^[[:alnum:]]+=$/ +% MOR-STEM: /^[[:alnum:]]+.=$/ +% MOR-SUFF: /^=[[:alnum:]]+$/ % Match anything that doesn't match the above. % Match anything that isn't white-space. 
From 36901d055ddd978cab73e1292118754c74cf9c2f Mon Sep 17 00:00:00 2001 From: ampli Date: Sat, 9 Jul 2022 02:04:01 +0300 Subject: [PATCH 02/23] anysplit(): Move the sanity checks to the start --- link-grammar/tokenize/anysplit.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 02593939bd..6f6d2d750c 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -433,9 +433,12 @@ bool anysplit_init(Dictionary afdict) #define D_AS 5 bool anysplit(Sentence sent, Gword *unsplit_word) { - const char * word = unsplit_word->subword; Dictionary afdict = sent->dict->affix_table; - anysplit_params *as; + if (NULL == afdict) return false; + anysplit_params * as = afdict->anysplit; + if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */ + + const char * word = unsplit_word->subword; Afdict_class * stemsubscr; size_t l = strlen(word); @@ -452,11 +455,6 @@ bool anysplit(Sentence sent, Gword *unsplit_word) char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. 
affix length */ bool use_sampling = true; - if (NULL == afdict) return false; - as = afdict->anysplit; - - if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */ - if (lutf > MAX_WORD_TO_SPLIT) { Gword *alt = issue_word_alternative(sent, unsplit_word, "AS>", From f3c7d8acaec134379cdf71d40f08a1c9e81e5ef0 Mon Sep 17 00:00:00 2001 From: ampli Date: Sat, 9 Jul 2022 02:01:34 +0300 Subject: [PATCH 03/23] free_anysplit(): Move it to be near its usage --- link-grammar/tokenize/anysplit.c | 41 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 6f6d2d750c..da2acbf038 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -187,27 +187,6 @@ static int split_and_cache(int word_length, int nparts, split_cache *scl) return maxindex+1; } -void free_anysplit(Dictionary afdict) -{ - size_t i; - anysplit_params *as = afdict->anysplit; - - if (NULL == as) return; - - for (i = 0; i < ARRAY_SIZE(as->scl); i++) - { - if (NULL == as->scl[i].sp) continue; - free(as->scl[i].sp); - free(as->scl[i].p_selected); - free(as->scl[i].p_tried); - } - free_regexs(as->regpre); - free_regexs(as->regmid); - free_regexs(as->regsuf); - free(as); - afdict->anysplit = NULL; -} - /* * Returns: Number of splits. 
*/ @@ -327,6 +306,26 @@ static Regex_node * regbuild(const char **regstring, int n, int classnum) return regex_root; } +void free_anysplit(Dictionary afdict) +{ + size_t i; + anysplit_params *as = afdict->anysplit; + + if (NULL == as) return; + + for (i = 0; i < ARRAY_SIZE(as->scl); i++) + { + if (NULL == as->scl[i].sp) continue; + free(as->scl[i].sp); + free(as->scl[i].p_selected); + free(as->scl[i].p_tried); + } + free_regexs(as->regpre); + free_regexs(as->regmid); + free_regexs(as->regsuf); + free(as); + afdict->anysplit = NULL; +} /** * Affix classes: From 8df81c591d2c0c32cfbdc3954c8e89080a93d8e9 Mon Sep 17 00:00:00 2001 From: ampli Date: Sat, 9 Jul 2022 02:28:37 +0300 Subject: [PATCH 04/23] morpheme_match(): Rename prefix_string to word_part --- link-grammar/tokenize/anysplit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index da2acbf038..e754378966 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -245,13 +245,13 @@ static bool morpheme_match(Sentence sent, int p; Regex_node *re; size_t blen = strlen(word); - char *prefix_string = alloca(blen+1); + char *word_part = alloca(blen+1); lgdebug(+D_MM, "word=%s: ", word); for (p = 0; p < as->nparts; p++) { - size_t b = utf8_strncpy(prefix_string, &word[bos], pl[p]-cpos); - prefix_string[b] = '\0'; + size_t b = utf8_strncpy(word_part, &word[bos], pl[p]-cpos); + word_part[b] = '\0'; bos += b; /* For flexibility, REGRPE is matched only to the prefix part, @@ -260,10 +260,10 @@ static bool morpheme_match(Sentence sent, if (0 == p) re = as->regpre; else if (pl[p] == (int) lutf) re = as->regsuf; else re = as->regmid; - lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, prefix_string); + lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, word_part); /* A NULL regex always matches */ - if ((NULL != re) && (NULL == match_regex(re, prefix_string))) + if ((NULL != re) && 
(NULL == match_regex(re, word_part))) { lgdebug(D_MM, "No match\n"); return false; From b22706c736bce2af4300db63ad17d9298187cbaa Mon Sep 17 00:00:00 2001 From: ampli Date: Sat, 9 Jul 2022 02:30:41 +0300 Subject: [PATCH 05/23] anysplit(): Remove commented-out code line --- link-grammar/tokenize/anysplit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index e754378966..602fe8f27c 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -570,7 +570,6 @@ bool anysplit(Sentence sent, Gword *unsplit_word) bos += b; cpos = pl[p]; - // if (cpos == lutf) break; /* Same thing as below...*/ if (bos == l) break; } From 97fd0c790fc95271c335821e4e45fc6a53de076d Mon Sep 17 00:00:00 2001 From: ampli Date: Sun, 10 Jul 2022 00:33:07 +0300 Subject: [PATCH 06/23] anysplit.c: Fix comment rot --- link-grammar/tokenize/anysplit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 602fe8f27c..c153330593 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -43,7 +43,9 @@ #define MAX_WORD_TO_SPLIT 63 /* in codepoints */ -typedef int p_start; /* partition start in a word */ +extern const char * const afdict_classname[]; + +typedef int p_start; /* partition end in a word (end char position + 1) */ typedef p_start *p_list; /* list of partitions in a word */ typedef struct split_cache /* split cached by word length */ From 73eaff66885748f1fbd5669c3e5b39062a068c69 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 11 Jul 2022 16:42:55 +0300 Subject: [PATCH 07/23] anysplit.c: Rename p_start to p_end --- link-grammar/tokenize/anysplit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index c153330593..2740054358 100644 --- a/link-grammar/tokenize/anysplit.c +++ 
b/link-grammar/tokenize/anysplit.c @@ -45,8 +45,8 @@ extern const char * const afdict_classname[]; -typedef int p_start; /* partition end in a word (end char position + 1) */ -typedef p_start *p_list; /* list of partitions in a word */ +typedef int p_end; /* partition end in a word (end char position + 1) */ +typedef p_end *p_list; /* list of partitions in a word */ typedef struct split_cache /* split cached by word length */ { @@ -102,7 +102,7 @@ static void printps(int *ps, int n) static void cache_partitions(p_list pl, int *ps, int p) { - memcpy(pl, ps, sizeof(p_start) * p); + memcpy(pl, ps, sizeof(p_end) * p); } /* p = 5 */ @@ -126,7 +126,7 @@ static int split_and_cache(int word_length, int nparts, split_cache *scl) int n; int maxindex; - p_list ps = alloca(sizeof(p_start)*nparts); /* partition start */ + p_list ps = alloca(sizeof(p_end)*nparts); if (0 == word_length) return 0; @@ -206,7 +206,7 @@ static int split(int word_length, int nparts, split_cache *scl) word_length, nparts); return 0; } - scl->sp = malloc(sizeof(p_start)*nparts * nsplits); + scl->sp = malloc(sizeof(p_end)*nparts * nsplits); scl->p_selected = malloc(sizeof(*(scl->p_selected)) * nsplits); scl->p_tried = malloc(sizeof(*(scl->p_tried)) * nsplits); split_and_cache(word_length, nparts, scl); @@ -260,7 +260,7 @@ static bool morpheme_match(Sentence sent, * REGMID only to the middle suffixes, and REGSUF only to the * suffix part - which cannot be the prefix. 
*/ if (0 == p) re = as->regpre; - else if (pl[p] == (int) lutf) re = as->regsuf; + else if (pl[p] == (p_end)lutf) re = as->regsuf; else re = as->regmid; lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, word_part); From af8e4816f496819f3fdb3e2044e84d54ed496f74 Mon Sep 17 00:00:00 2001 From: ampli Date: Sun, 10 Jul 2022 01:28:27 +0300 Subject: [PATCH 08/23] morpheme_match(): Update description --- link-grammar/tokenize/anysplit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 2740054358..81a01004cd 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -234,8 +234,9 @@ static int rng_uniform(unsigned int *seedp, size_t nsplits) } -/* lutf is the length of the string, measured in code-points, - * blen is the length of the string, measured in bytes. +/** + * Match the \p word parts \p pl to REGPRE, REGMID, and REGSUF. + * All the parts must match. */ #define D_MM 7 static bool morpheme_match(Sentence sent, From 1576152038d5a4f18587b63831eeb76d030a5eda Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 11 Jul 2022 01:50:10 +0300 Subject: [PATCH 09/23] anysplit.c: Define D_ANYS as the verbosity level for this file --- link-grammar/tokenize/anysplit.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 81a01004cd..ce49232977 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -9,6 +9,8 @@ /* */ /*************************************************************************/ +#define D_ANYS 5 /* Debug level for this file (6 for more) */ + /** * anysplit.c -- code that splits words into random morphemes. * This is used for the language-learning/morpheme-learning project. @@ -432,7 +434,6 @@ bool anysplit_init(Dictionary afdict) * - an error occurs (the behavior then is undefined). 
* Such an error has not been observed yet. */ -#define D_AS 5 bool anysplit(Sentence sent, Gword *unsplit_word) { Dictionary afdict = sent->dict->affix_table; @@ -493,7 +494,7 @@ bool anysplit(Sentence sent, Gword *unsplit_word) use_sampling = false; } - lgdebug(+D_AS, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, " + lgdebug(+D_ANYS, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, " "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no", word, nsplits, as->nparts, as->altsmin, as->altsmax); @@ -514,13 +515,13 @@ bool anysplit(Sentence sent, Gword *unsplit_word) sample_point++; } - lgdebug(D_AS, "Sample: %d ", sample_point); if (as->scl[lutf].p_tried[sample_point]) + lgdebug(D_ANYS, "Sample: %d ", sample_point); { - lgdebug(D_AS+1, "(repeated)\n"); + lgdebug(D_ANYS+1, "(repeated)\n"); continue; } - lgdebug(D_AS+1, "(new)"); + lgdebug(D_ANYS+1, "(new)"); rndtried++; as->scl[lutf].p_tried[sample_point] = true; /* The regexes in the affix file can be used to reject partitioning @@ -532,11 +533,11 @@ bool anysplit(Sentence sent, Gword *unsplit_word) } else { - lgdebug(D_AS, "\n"); + lgdebug(D_ANYS, "\n"); } } - lgdebug(D_AS, "Results: word '%s' (utf-char=%zu utf-byte-length=%zu): %zu/%zu:\n", + lgdebug(D_ANYS, "Results: word '%s' (utf-char=%zu utf-byte-length=%zu): %zu/%zu:\n", word, lutf, l, rndissued, nsplits); for (i = 0; i < nsplits; i++) @@ -622,4 +623,3 @@ bool anysplit(Sentence sent, Gword *unsplit_word) if (0 != sent->rand_state) sent->rand_state = seed; return true; } -#undef D_AS From 128fccdc216a04dc19aad314846ca7f10cccbfe0 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 11 Jul 2022 22:12:29 +0300 Subject: [PATCH 10/23] anysplit(): Move 0 length check to the start --- link-grammar/tokenize/anysplit.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index ce49232977..05897f4e12 100644 --- a/link-grammar/tokenize/anysplit.c +++ 
b/link-grammar/tokenize/anysplit.c @@ -444,7 +444,6 @@ bool anysplit(Sentence sent, Gword *unsplit_word) const char * word = unsplit_word->subword; Afdict_class * stemsubscr; - size_t l = strlen(word); size_t lutf = utf8_strlen(word); p_list pl; size_t bos, cpos; /* byte offset, codepoint offset */ @@ -455,10 +454,18 @@ bool anysplit(Sentence sent, Gword *unsplit_word) size_t rndissued = 0; size_t i; unsigned int seed = sent->rand_state; - char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. affix length */ bool use_sampling = true; + size_t l = strlen(word); + char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. affix length */ + if (lutf > MAX_WORD_TO_SPLIT) + if (0 == l) + { + prt_error("Warning: anysplit(): word length 0\n"); + return false; + } + { Gword *alt = issue_word_alternative(sent, unsplit_word, "AS>", 0,NULL, 1,&word, 0,NULL); From 53dd29f144c070924cfc8a8aeeb067d66bf6c4e6 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 11 Jul 2022 22:14:20 +0300 Subject: [PATCH 11/23] anysplit.c: Include pcre2.h --- link-grammar/tokenize/anysplit.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 05897f4e12..a46a87ade7 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -29,6 +29,10 @@ #include #include #include +#if HAVE_PCRE2_H // use it to split in grapheme boundaries +#define PCRE2_CODE_UNIT_WIDTH 8 +#include +#endif // HAVE_PCRE2_H #include "api-structures.h" #include "dict-common/dict-affix.h" From aa3e7f1dffc12582e1f652f851f5fa540f8d9ebe Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 11 Jul 2022 22:16:29 +0300 Subject: [PATCH 12/23] anysplit.c: Add data structure for grapheme separation --- link-grammar/tokenize/anysplit.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index a46a87ade7..93bf157008 100644 --- a/link-grammar/tokenize/anysplit.c 
+++ b/link-grammar/tokenize/anysplit.c @@ -47,7 +47,7 @@ #include "anysplit.h" -#define MAX_WORD_TO_SPLIT 63 /* in codepoints */ +#define MAX_WORD_TO_SPLIT 63 /* in codepoints (or graphemes if HAVE_PCRE2_H) */ extern const char * const afdict_classname[]; @@ -62,12 +62,23 @@ typedef struct split_cache /* split cached by word length */ bool *p_selected; /* list of selected splits */ } split_cache; +#if HAVE_PCRE2_H +typedef struct { + char *pattern; + pcre2_code *code; + pcre2_match_data* match_data; +} grapheme_regex; +#endif + typedef struct anysplit_params { int nparts; /* maximum number of suffixes to split to */ size_t altsmin; /* minimum number of alternatives to generate */ size_t altsmax; /* maximum number of alternatives to generate */ Regex_node *regpre, *regmid, *regsuf; /* issue matching combinations */ +#if HAVE_PCRE2_H + grapheme_regex gr; +#endif // HAVE_PCRE2_H split_cache scl[MAX_WORD_TO_SPLIT+1]; /* split cache according to word length */ } anysplit_params; From 3193e89ea67e0edd04e9994a291ed18117f489e4 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 11 Jul 2022 22:21:22 +0300 Subject: [PATCH 13/23] anysplit.c: Add functions for grapheme separation --- link-grammar/tokenize/anysplit.c | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 93bf157008..4929e44dc1 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -326,6 +326,64 @@ static Regex_node * regbuild(const char **regstring, int n, int classnum) return regex_root; } +#if HAVE_PCRE2_H +static bool gr_reg_comp(grapheme_regex *re) +{ + PCRE2_SIZE erroffset; + int rc; + + re->code = pcre2_compile((PCRE2_SPTR)re->pattern, PCRE2_ZERO_TERMINATED, + PCRE2_UTF, &rc, &erroffset, NULL); + if (re->code != NULL) + { + re->match_data = pcre2_match_data_create_from_pattern(re->code, NULL); + if (re->match_data == NULL) + { + prt_error("Error: 
pcre2_match_data_create_from_pattern() failed\n"); + pcre2_code_free(re->code); + return false; + } + return true; + } + + /* We have an error. */ +#define ERRBUFFLEN 120 + PCRE2_UCHAR errbuf[ERRBUFFLEN]; + pcre2_get_error_message(rc, errbuf, ERRBUFFLEN); + prt_error("Error: Failed to compile grapheme regex \"%s\": %s (code %d) at %d\n", + re->pattern, errbuf, rc, (int)erroffset); + return false; +} + +static int gr_reg_match(const char *word, grapheme_regex *re) +{ + int rc = pcre2_match(re->code, (PCRE2_SPTR)word, + PCRE2_ZERO_TERMINATED, /*startoffset*/0, + PCRE2_NO_UTF_CHECK, re->match_data, NULL); + if (rc == PCRE2_ERROR_NOMATCH) return rc; + if (rc > 0) return rc; + if (rc == 0) + { + prt_error("Error: pcre2_match(): ovector: Internal error\"\n"); + return rc; + } + + /* We have an error. */ + PCRE2_UCHAR errbuf[ERRBUFFLEN]; + pcre2_get_error_message(rc, errbuf, ERRBUFFLEN); + prt_error("Error: pcre2_match(): \"%s\": %s (code %d)\n", + re->pattern, errbuf, rc); + return rc; +} + +static void gr_pcre2_free(grapheme_regex *re) +{ + free(re->pattern); + pcre2_match_data_free(re->match_data); + pcre2_code_free(re->code); +} +#endif // HAVE_PCRE2_H + void free_anysplit(Dictionary afdict) { size_t i; From cf6a49818b25a0eaeb4e88d9b53757fe5d8aed35 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 11 Jul 2022 22:37:50 +0300 Subject: [PATCH 14/23] anysplit.c: Add ability to split on grapheme boundaries --- link-grammar/tokenize/anysplit.c | 175 +++++++++++++++++++++---------- 1 file changed, 122 insertions(+), 53 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 4929e44dc1..3a51debafc 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -256,21 +256,21 @@ static int rng_uniform(unsigned int *seedp, size_t nsplits) * All the parts must match. 
*/ #define D_MM 7 -static bool morpheme_match(Sentence sent, - const char *word, size_t lutf, p_list pl) +static bool morpheme_match(Sentence sent, const char *word, unsigned int nunits, + unsigned int *word_upos, p_list pl) { Dictionary afdict = sent->dict->affix_table; anysplit_params *as = afdict->anysplit; - size_t bos = 0, cpos = 0; /* byte offset, code-point offset */ - int p; - Regex_node *re; - size_t blen = strlen(word); - char *word_part = alloca(blen+1); + char *word_part = alloca(strlen(word) + 1); lgdebug(+D_MM, "word=%s: ", word); - for (p = 0; p < as->nparts; p++) + for (int p = 0; p < as->nparts; p++) { - size_t b = utf8_strncpy(word_part, &word[bos], pl[p]-cpos); + size_t bos = 0, upos = 0; /* word offset, unit offset (both in bytes) */ + size_t b = word_upos[pl[p] - 1] - upos; + Regex_node *re; + + memcpy(word_part, &word[bos], b); word_part[b] = '\0'; bos += b; @@ -278,7 +278,7 @@ static bool morpheme_match(Sentence sent, * REGMID only to the middle suffixes, and REGSUF only to the * suffix part - which cannot be the prefix. */ if (0 == p) re = as->regpre; - else if (pl[p] == (p_end)lutf) re = as->regsuf; + else if (pl[p] == (p_end)nunits) re = as->regsuf; else re = as->regmid; lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, word_part); @@ -289,8 +289,8 @@ static bool morpheme_match(Sentence sent, return false; } - cpos = pl[p]; - if (cpos == lutf) break; + if (pl[p] == (int)nunits) break; + upos = word_upos[pl[p] - 1]; } lgdebug(D_MM, "Match\n"); @@ -401,6 +401,11 @@ void free_anysplit(Dictionary afdict) free_regexs(as->regpre); free_regexs(as->regmid); free_regexs(as->regsuf); + +#if HAVE_PCRE2_H + gr_pcre2_free(&as->gr); +#endif // HAVE_PCRE2_H + free(as); afdict->anysplit = NULL; } @@ -495,10 +500,93 @@ bool anysplit_init(Dictionary afdict) return false; } +#if HAVE_PCRE2_H + const char upat[] = "\\X"; + const char bpat[] = "^(?>"; + const char epat[] = "(.+)?)$"; + + // Build an optional match fore a single grapheme. 
+ const unsigned int ubuf_strlen = strlen(upat) + /*()?*/3; + char *ubuf = alloca(ubuf_strlen + 1); + snprintf(ubuf, ubuf_strlen + 1, "(%s)?", upat); + + // Build a pattern to match all the graphemes in a word: "^(>(\\X)?...)$" + as->gr.pattern = + malloc(sizeof(bpat)-1 + ubuf_strlen * MAX_WORD_TO_SPLIT + sizeof(epat)); + strcpy(as->gr.pattern, bpat); + unsigned int n = strlen(as->gr.pattern); + for (i = 0; i < MAX_WORD_TO_SPLIT; i++, n+= ubuf_strlen) + strcpy(&as->gr.pattern[n], ubuf); + strcpy(&as->gr.pattern[n], epat); + + if (!gr_reg_comp(&as->gr)) return false; +#endif // HAVE_PCRE2_H + return true; } #undef D_AI +/* + * Return the number of units (codepoints or graphemes) in \p word. + * On error, return 0; + */ +static unsigned int strlen_units(anysplit_params *as, const char *word) +{ +#if !HAVE_PCRE2_H + // Number of codepoints. + return (unsigned int)utf8_strlen(word); +#else // HAVE_PCRE2_H + // Number of graphemes. + int rc = gr_reg_match(word, &as->gr); + if (rc <= 1) return 0; + return (unsigned int)(rc - 1); +#endif // !HAVE_PCRE2_H +} + +/** + * Set the elements of \p word_pos (containing \p nunits elements) to the + * end positions (last char position + 1) of the atomic units in \p word. + */ +static void build_unit_positions(anysplit_params *as, const char *word, + unsigned int nunits, unsigned int *word_pos) +{ + dassert(nunits != 0, "At least one atomic unit is expected"); + const unsigned int *word_pos_base = word_pos; + +#if HAVE_PCRE2_H + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(as->gr.match_data); + + /* The first [start,end) is of the whole match (the word in this case). 
*/ + for (unsigned int i = 1; i < nunits + 1; i++) + *word_pos++ = (unsigned int)ovector[2*i + 1]; + +#else + unsigned int pos = 0; + + for (unsigned int i = 0; word[i] != '\0'; i = pos) + { + pos += utf8_charlen(&word[i]); + *word_pos++ = pos; + } + +#endif // HAVE_PCRE2_H + + if (verbosity_level(D_ANYS+1)) + { + unsigned int bos = 0; + + prt_error("Debug: %u atomic units:\n\\", nunits); + for (unsigned int i = 0; i < nunits; i ++) + { + prt_error("%u) %.*s\n\\", i+1,(int)(word_pos_base[i]-bos), &word[bos]); + bos = word_pos_base[i]; + } + prt_error("\n"); + + } + dassert(word_pos[-1] == strlen(word), "Inconsistent word end"); +} + /** * Split randomly. * Return true on success. @@ -517,10 +605,6 @@ bool anysplit(Sentence sent, Gword *unsplit_word) const char * word = unsplit_word->subword; Afdict_class * stemsubscr; - size_t lutf = utf8_strlen(word); - p_list pl; - size_t bos, cpos; /* byte offset, codepoint offset */ - int p; int sample_point; size_t nsplits; size_t rndtried = 0; @@ -528,17 +612,18 @@ bool anysplit(Sentence sent, Gword *unsplit_word) size_t i; unsigned int seed = sent->rand_state; bool use_sampling = true; + unsigned int nunits = strlen_units(as, word); size_t l = strlen(word); char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. 
affix length */ - if (lutf > MAX_WORD_TO_SPLIT) if (0 == l) { prt_error("Warning: anysplit(): word length 0\n"); return false; } + if ((nunits > MAX_WORD_TO_SPLIT) || (nunits == 0)) { Gword *alt = issue_word_alternative(sent, unsplit_word, "AS>", 0,NULL, 1,&word, 0,NULL); @@ -546,11 +631,8 @@ bool anysplit(Sentence sent, Gword *unsplit_word) return true; } - if (0 == l) - { - prt_error("Warning: anysplit(): word length 0\n"); - return false; - } + unsigned int *word_upos = alloca(sizeof(int) * nunits); + build_unit_positions(as, word, nunits, word_upos); stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR); @@ -560,7 +642,7 @@ bool anysplit(Sentence sent, Gword *unsplit_word) gw = word; #endif - nsplits = split(lutf, as->nparts, &as->scl[lutf]); + nsplits = split(nunits, as->nparts, &as->scl[nunits]); if (0 == nsplits) { prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n"); @@ -595,20 +677,20 @@ bool anysplit(Sentence sent, Gword *unsplit_word) sample_point++; } - if (as->scl[lutf].p_tried[sample_point]) lgdebug(D_ANYS, "Sample: %d ", sample_point); + if (as->scl[nunits].p_tried[sample_point]) { lgdebug(D_ANYS+1, "(repeated)\n"); continue; } lgdebug(D_ANYS+1, "(new)"); rndtried++; - as->scl[lutf].p_tried[sample_point] = true; + as->scl[nunits].p_tried[sample_point] = true; /* The regexes in the affix file can be used to reject partitioning * that break graphemes. 
*/ - if (morpheme_match(sent, word, lutf, &as->scl[lutf].sp[sample_point*as->nparts])) + if (morpheme_match(sent, word, nunits, word_upos, &as->scl[nunits].sp[sample_point*as->nparts])) { - as->scl[lutf].p_selected[sample_point] = true; + as->scl[nunits].p_selected[sample_point] = true; rndissued++; } else @@ -617,44 +699,31 @@ bool anysplit(Sentence sent, Gword *unsplit_word) } } - lgdebug(D_ANYS, "Results: word '%s' (utf-char=%zu utf-byte-length=%zu): %zu/%zu:\n", - word, lutf, l, rndissued, nsplits); + lgdebug(D_ANYS, "Results: word '%s' (units=%u byte-length=%zu): %zu/%zu:\n", + word, nunits, l, rndissued, nsplits); for (i = 0; i < nsplits; i++) { + size_t bos = 0, upos = 0; /* byte offset, codepoint offset */ const char **affixes = NULL; int num_sufixes; int num_affixes = 0; - if (!as->scl[lutf].p_selected[i]) continue; + if (!as->scl[nunits].p_selected[i]) continue; - pl = &as->scl[lutf].sp[i*as->nparts]; - bos = 0; - cpos = 0; - for (p = 0; p < as->nparts; p++) + p_list pl = &as->scl[nunits].sp[i*as->nparts]; + for (int p = 0; p < as->nparts; p++) { - size_t b = 0; - if (pl[0] == (int)lutf) /* This is the whole word */ - { - b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); - affix[b] = '\0'; - } - else if (0 == cpos) /* The first, but not the only morpheme */ - { - b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); - affix[b] = '\0'; - } - else /* 2nd and subsequent morphemes */ - { - b = utf8_strncpy(affix, &word[bos], pl[p]-cpos); - affix[b] = '\0'; - num_affixes++; - } - altappend(sent, &affixes, affix); + size_t b = word_upos[pl[p] - 1] - upos; + memcpy(affix, &word[bos], b); + affix[b] = '\0'; bos += b; - cpos = pl[p]; + altappend(sent, &affixes, affix); + if (bos == l) break; + upos = word_upos[pl[p] - 1]; + num_affixes++; } const char **prefix_position, **stem_position , **suffix_position; From c2957548899d9bca015a4f81a8bc21e0a8acc467 Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 12 Jul 2022 00:23:33 +0300 Subject: [PATCH 15/23] anysplit.c: Add 
#define atomic-unit instead of a hardcoded value --- data/amy/4.0.affix | 6 ++ link-grammar/tokenize/anysplit.c | 95 ++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix index b6cd8b06e5..bd0bd94e84 100644 --- a/data/amy/4.0.affix +++ b/data/amy/4.0.affix @@ -17,6 +17,12 @@ % Anysplit parameters +% A PCRE2 regex defining a character sequence that shouldn't get split. +% The LG library must be configured with PCRE2 in order to use it. If not, +% or if this definition is missing, a single utf8 codpoint is used as a +% byte sequence that should not get split. +#define atomic_unit "\X"; + % Maximum number of word partitions % FYI: 3 barely works, 4 and higher mostly do not work. % 6: REGPARTS+; diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 3a51debafc..06f84c484e 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -403,7 +403,8 @@ void free_anysplit(Dictionary afdict) free_regexs(as->regsuf); #if HAVE_PCRE2_H - gr_pcre2_free(&as->gr); + if (as->gr.pattern != NULL) + gr_pcre2_free(&as->gr); #endif // HAVE_PCRE2_H free(as); @@ -501,25 +502,32 @@ bool anysplit_init(Dictionary afdict) } #if HAVE_PCRE2_H - const char upat[] = "\\X"; - const char bpat[] = "^(?>"; - const char epat[] = "(.+)?)$"; - - // Build an optional match fore a single grapheme. 
- const unsigned int ubuf_strlen = strlen(upat) + /*()?*/3; - char *ubuf = alloca(ubuf_strlen + 1); - snprintf(ubuf, ubuf_strlen + 1, "(%s)?", upat); - - // Build a pattern to match all the graphemes in a word: "^(>(\\X)?...)$" - as->gr.pattern = - malloc(sizeof(bpat)-1 + ubuf_strlen * MAX_WORD_TO_SPLIT + sizeof(epat)); - strcpy(as->gr.pattern, bpat); - unsigned int n = strlen(as->gr.pattern); - for (i = 0; i < MAX_WORD_TO_SPLIT; i++, n+= ubuf_strlen) - strcpy(&as->gr.pattern[n], ubuf); - strcpy(&as->gr.pattern[n], epat); - - if (!gr_reg_comp(&as->gr)) return false; + const char *upat = linkgrammar_get_dict_define(afdict, "atomic-unit"); + if (upat == NULL) + { + as->gr.pattern = NULL; + } + else + { + const char bpat[] = "^(?>"; + const char epat[] = "(.+)?)$"; + + // Build an optional match for a single grapheme. + const unsigned int ubuf_strlen = strlen(upat) + /*()?*/3; + char *ubuf = alloca(ubuf_strlen + 1); + snprintf(ubuf, ubuf_strlen + 1, "(%s)?", upat); + + // Build a pattern to match all the graphemes in a word: "^(>(\\X)?...)$" + as->gr.pattern = + malloc(sizeof(bpat)-1 + ubuf_strlen * MAX_WORD_TO_SPLIT + sizeof(epat)); + strcpy(as->gr.pattern, bpat); + unsigned int n = strlen(as->gr.pattern); + for (i = 0; i < MAX_WORD_TO_SPLIT; i++, n+= ubuf_strlen) + strcpy(&as->gr.pattern[n], ubuf); + strcpy(&as->gr.pattern[n], epat); + + if (!gr_reg_comp(&as->gr)) return false; + } #endif // HAVE_PCRE2_H return true; @@ -528,19 +536,21 @@ bool anysplit_init(Dictionary afdict) /* * Return the number of units (codepoints or graphemes) in \p word. - * On error, return 0; + * Since \p word shouldn't be a null string, returned 0 means an error. */ static unsigned int strlen_units(anysplit_params *as, const char *word) { -#if !HAVE_PCRE2_H +#if HAVE_PCRE2_H + if (as->gr.pattern != NULL) + { + // Number of graphemes. + int rc = gr_reg_match(word, &as->gr); + if (rc <= 1) return 0; + return (unsigned int)(rc - 1); + } +#endif // HAVE_PCRE2_H // Number of codepoints. 
return (unsigned int)utf8_strlen(word); -#else // HAVE_PCRE2_H - // Number of graphemes. - int rc = gr_reg_match(word, &as->gr); - if (rc <= 1) return 0; - return (unsigned int)(rc - 1); -#endif // !HAVE_PCRE2_H } /** @@ -554,22 +564,25 @@ static void build_unit_positions(anysplit_params *as, const char *word, const unsigned int *word_pos_base = word_pos; #if HAVE_PCRE2_H - PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(as->gr.match_data); - - /* The first [start,end) is of the whole match (the word in this case). */ - for (unsigned int i = 1; i < nunits + 1; i++) - *word_pos++ = (unsigned int)ovector[2*i + 1]; - -#else - unsigned int pos = 0; - - for (unsigned int i = 0; word[i] != '\0'; i = pos) + if (as->gr.pattern != NULL) { - pos += utf8_charlen(&word[i]); - *word_pos++ = pos; - } + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(as->gr.match_data); + /* The first [start,end) is of the whole match (the word in this case). */ + for (unsigned int i = 1; i < nunits + 1; i++) + *word_pos++ = (unsigned int)ovector[2*i + 1]; + } + else #endif // HAVE_PCRE2_H + { + unsigned int pos = 0; + + for (unsigned int i = 0; word[i] != '\0'; i = pos) + { + pos += utf8_charlen(&word[i]); + *word_pos++ = pos; + } + } if (verbosity_level(D_ANYS+1)) { From 74a14684a614646f47c78baa969646286147dc87 Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 12 Jul 2022 00:46:04 +0300 Subject: [PATCH 16/23] amy/4.0.affix: Remove regexes for REG* No need for them after the grapheme-aware separation modification. --- data/amy/4.0.affix | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix index bd0bd94e84..4106fde1c1 100644 --- a/data/amy/4.0.affix +++ b/data/amy/4.0.affix @@ -53,16 +53,13 @@ % For ASCII input, the empty regexes can be used. % See the comments in 4.0.affix. -%"" : REGPRE+; -"^\X+$" : REGPRE+; +"" : REGPRE+; % Regex to match the middle parts. 
-%"" : REGMID+; -"^\X+$" : REGMID+; +"" : REGMID+; %".{2,}": REGMID+; % Regex to match the suffix. -%"" : REGSUF+; -"^\X+$" : REGSUF+; +"" : REGSUF+; % End of Anysplit parameters. From 54423ab0cf5114d4e5173c9fe47855f2bd5591ee Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 12 Jul 2022 00:56:01 +0300 Subject: [PATCH 17/23] amy/4.0.affix: Include trailing mark codepoints in atomic-unit This way morpheme candidates (split parts) do not start with marks. This looks nicer and gives fewer splits. I don't know if it is more useful. --- data/amy/4.0.affix | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix index 4106fde1c1..15a8647e90 100644 --- a/data/amy/4.0.affix +++ b/data/amy/4.0.affix @@ -21,7 +21,8 @@ % The LG library must be configured with PCRE2 in order to use it. If not, % or if this definition is missing, a single utf8 codpoint is used as a % byte sequence that should not get split. -#define atomic_unit "\X"; +%#define atomic-unit "\X"; % split at grapheme boundaries. +#define atomic-unit "\X\pM*"; % ... but include trailing mark codepoints. % Maximum number of word partitions % FYI: 3 barely works, 4 and higher mostly do not work.
From e236030a0498e2e4ec55ff6a77b3470f8019ee3d Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 12 Jul 2022 02:09:21 +0300 Subject: [PATCH 18/23] anysplit.c: Change p_end to unsigned int --- link-grammar/tokenize/anysplit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 06f84c484e..735fd1ca70 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -51,10 +51,10 @@ extern const char * const afdict_classname[]; -typedef int p_end; /* partition end in a word (end char position + 1) */ -typedef p_end *p_list; /* list of partitions in a word */ +typedef unsigned int p_end; /* partition end in a word (end char position +1) */ +typedef p_end *p_list; /* list of partitions in a word */ -typedef struct split_cache /* split cached by word length */ +typedef struct split_cache /* split cached by word length */ { size_t nsplits; /* number of splits */ p_list sp; /* list of splits */ @@ -117,7 +117,7 @@ static void printps(int *ps, int n) } #endif -static void cache_partitions(p_list pl, int *ps, int p) +static void cache_partitions(p_list pl, unsigned int *ps, int p) { memcpy(pl, ps, sizeof(p_end) * p); } @@ -289,7 +289,7 @@ static bool morpheme_match(Sentence sent, const char *word, unsigned int nunits, return false; } - if (pl[p] == (int)nunits) break; + if (pl[p] == nunits) break; upos = word_upos[pl[p] - 1]; } From 82a97b0c995cfe96666bbf568a2a94e3362771bf Mon Sep 17 00:00:00 2001 From: ampli Date: Wed, 10 Aug 2022 10:33:28 +0300 Subject: [PATCH 19/23] any/affix_punc: Replace one-char affixes by [[:punct:]] --- data/any/affix-punc | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/data/any/affix-punc b/data/any/affix-punc index 65cddd9f55..19530ae416 100644 --- a/data/any/affix-punc +++ b/data/any/affix-punc @@ -1,13 +1,5 @@ -")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』」 """ "’’" "’" ''.y '.y -"%" "," "." 。 ":" ";" "?" "!" 
‽ ؟ ?! ….y ....y "”" -_ - ‐ ‑ ‒ – — ― ~ ━ ー 、 -¢ ₵ ™ ℠ : RPUNC+; +"’’" ''.y ….y ....y "/[[:punct:]]$/.\0": RPUNC+; -"(" "{" "[" "<" « 〈 ( 〔 《 【 [ 『 「 """ `` „ “ ‘ ''.x '.x ….x ....x -¿ ¡ "$" -_ - ‐ ‑ ‒ – — ― ━ ー ~ -£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점 -† †† ‡ § ¶ © ® ℗ № "#": LPUNC+; +`` ''.x ….x ....x †† "/^[[:punct:]]/.\0": LPUNC+; --- ‒ – — ― - _ "(" ")" "[" "]" ... … "," ";" ":" -': MPUNC+; +-- ... … "/[[:punct:]]/.\0" ': MPUNC+; From ff92f6a392b6e92439ff4aeac8e0fc20d4fd8dc0 Mon Sep 17 00:00:00 2001 From: ampli Date: Wed, 10 Aug 2022 10:35:42 +0300 Subject: [PATCH 20/23] any/affix_punc: Add comments --- data/any/affix-punc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/data/any/affix-punc b/data/any/affix-punc index 19530ae416..849ff287e8 100644 --- a/data/any/affix-punc +++ b/data/any/affix-punc @@ -1,3 +1,11 @@ +% Affixes get stripped off the left and right side of words +% i.e. spaces are inserted between the affix and the word itself. + +% An LPUNC/RPUNC/MPUNC token can be specified as "/regex/.\N", when \N is +% the capture group that should match the affix (the whole pattern is +% capture group 0). Disregarding the position in which they appear, they +% are checked last - but in the same order. (Experimental.) + "’’" ''.y ….y ....y "/[[:punct:]]$/.\0": RPUNC+; `` ''.x ….x ....x †† "/^[[:punct:]]/.\0": LPUNC+; From 8b9f32eb8b22bfd646e5933fc6a0303c0427a333 Mon Sep 17 00:00:00 2001 From: ampli Date: Wed, 10 Aug 2022 10:43:17 +0300 Subject: [PATCH 21/23] amy/4.0.regex: Accept subscripted punctuation --- data/amy/4.0.regex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/amy/4.0.regex b/data/amy/4.0.regex index db822164bc..66b911b9ce 100644 --- a/data/amy/4.0.regex +++ b/data/amy/4.0.regex @@ -18,7 +18,7 @@ % along with many other punctuation characters that get strip from start % and end of words. See the "any/affix-punc" file. These punctuation % characters will match here. 
-ANY-PUNCT: /^[[:punct:]]+$/ +ANY-PUNCT: /^[[:punct:]]+(?:\x03|$)/ % Multi-part random morphology: match any string as prefix, stem, or suffix. % \x03 matches the internal representation of the dot in STEMSUBSCR From 9fb80f0a1d4e1c87a3fcf99274e954e053bcb198 Mon Sep 17 00:00:00 2001 From: ampli Date: Wed, 10 Aug 2022 10:55:39 +0300 Subject: [PATCH 22/23] amy/4.0.regex: Update the comments --- data/amy/4.0.regex | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/data/amy/4.0.regex b/data/amy/4.0.regex index 66b911b9ce..83367ab5ec 100644 --- a/data/amy/4.0.regex +++ b/data/amy/4.0.regex @@ -8,21 +8,19 @@ % The regexes here use the PCRE2 pattern syntax. % The LG library must be configured with PCRE2 in order to use them. -% \X matches any Unicode grapheme. -% Since most of the script-specific punctuation characters are not in -% the affix-punc file, they are allowed here to join to the end word/parts +% \X matches any Unicode grapheme. \x03 matches the internal representation +% of the dot in STEMSUBSCR (See 4.0.affix). % % For information on graphemes see: http://www.unicode.org/reports/tr29/ -% Hyphenated words, contractions, and words with underbars in them, get split, -% along with many other punctuation characters that get strip from start -% and end of words. See the "any/affix-punc" file. These punctuation -% characters will match here. +% Punctuation characters get stripped from the start and end of words, +% and words that contain punctuation get split at them. See the +% "any/affix-punc" file. +% These punctuation characters will match here. The \x03 is to match +% subscripted punctuation that may be specified in this file. ANY-PUNCT: /^[[:punct:]]+(?:\x03|$)/ % Multi-part random morphology: match any string as prefix, stem, or suffix. -% \x03 matches the internal representation of the dot in STEMSUBSCR -% (See 4.0.affix).
MOR-STEM: /^\X+\x03=$/ MOR-PREF: /^\X+=$/ @@ -36,7 +34,7 @@ ANY-WORD: /^[^[:punct:]]+$/ % For ASCII input and non-PCRE2 regex libraries you can use these instead: % ANY-WORD: /^[[:alnum:]]+$/ -% ANY-PUNCT: /^[[:punct:]]+$/ +% ANY-PUNCT: /^[[:punct:]].*$/ % The .* is to match an optional subscript. % MOR-PREF: /^[[:alnum:]]+=$/ % MOR-STEM: /^[[:alnum:]]+.=$/ % MOR-SUFF: /^=[[:alnum:]]+$/ From 56247cb71840cdb6338a9ee3abd2dc76575ae760 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 12 Aug 2022 17:31:35 +0300 Subject: [PATCH 23/23] afdict_init(): Validate affixes w/dictionary_word_is_known() ...instead of dict_has_word(), to allow punctuation that match a regex. --- link-grammar/dict-common/dict-impl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c index 8f97034331..1e3d83448b 100644 --- a/link-grammar/dict-common/dict-impl.c +++ b/link-grammar/dict-common/dict-impl.c @@ -805,7 +805,7 @@ bool afdict_init(Dictionary dict) for (int n = 0; n < ac->length - ac->Nregexes; n++) { - if (!dict_has_word(dict, ac->string[n])) + if (!dictionary_word_is_known(dict, ac->string[n])) { if (!not_in_dict) {