From 5825dd223121ec0a0c83d297124b8c93741dddf5 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Fri, 8 Jul 2022 19:49:20 +0300
Subject: [PATCH 01/23] amy/4.0.affix,amy/4.0.regex: Simplify the regexes

This doesn't work yet for splitting on grapheme boundaries, because ^\X
matches at least one codepoint, so it matches a split initial morpheme in a part.

This change is needed for the upcoming new code to split at grapheme
boundaries.
---
 data/amy/4.0.affix |  6 +++---
 data/amy/4.0.regex | 38 +++++++++++++++++++-------------------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix
index eb773751d0..b6cd8b06e5 100644
--- a/data/amy/4.0.affix
+++ b/data/amy/4.0.affix
@@ -48,15 +48,15 @@
 % See the comments in 4.0.affix.
 
 %"" : REGPRE+;
-"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGPRE+;
+"^\X+$" : REGPRE+;
 
 % Regex to match the middle parts.
 %"" : REGMID+;
-"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGMID+;
+"^\X+$" : REGMID+;
 %".{2,}": REGMID+;
 
 % Regex to match the suffix.
 %"" : REGSUF+;
-"^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$" : REGSUF+;
+"^\X+$" : REGSUF+;
 
 % End of Anysplit parameters.
diff --git a/data/amy/4.0.regex b/data/amy/4.0.regex
index 3129716434..db822164bc 100644
--- a/data/amy/4.0.regex
+++ b/data/amy/4.0.regex
@@ -9,37 +9,37 @@
 % The LG library must be configured with PCRE2 in order to use them.
 
 % \X matches any Unicode grapheme.
-% (?:(?=\p{Xan}) specifies that it should start with a letter or number.
-% Similarly, \pM allows it to start with a mark character.
 % Since most of the script-specific punctuation characters are not in
 % the affix-punc file, they are allowed here to join to the end word/parts
-% Most probably these regexes still reject valid word graphemes in some languages.
 %
 % For information on graphemes see: http://www.unicode.org/reports/tr29/
 
-% Want to match apostrophes, for abbreviations (I'm I've, etc.) since
-% these cannot be auto-split with the current splitter.
-% Hyphenated words, and words with underbars in them, get split.
-ANY-WORD:  /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*$/
+% Hyphenated words, contractions, and words with underbars in them, get split,
+% along with many other punctuation characters that get stripped from the
+% start and end of words. See the "any/affix-punc" file. These punctuation
+% characters will match here.
 ANY-PUNCT: /^[[:punct:]]+$/
 
-% Multi-part random morphology: match any string as prefix, stem, or
-% suffix.
+% Multi-part random morphology: match any string as prefix, stem, or suffix.
 % \x03 matches the internal representation of the dot in STEMSUBSCR
 % (See 4.0.affix).
 
-MOR-STEM: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*\x03=$/
-MOR-PREF: /^(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*=$/
-MOR-SUFF: /^=(?=\p{Xan})\X(?:(?=\p{Xan}|\pM|\p{Po})\X)*/
+MOR-STEM: /^\X+\x03=$/
+MOR-PREF: /^\X+=$/
+MOR-SUFF: /^=\X+/
 
-% For ASCII input, the following is enough (and it works even if the
-% LG library is configured with a regex library other then PCRE2).
-% To use it, uncomment it out and comment out the previous definitions.
-% ANY-WORD: /^[[:alnum:]']+$/
+% Reject anything that contains punctuation, so that the tokenizer will
+% have a chance to split them off as affixes.
+% Most of the script-dependent punctuation characters are not mentioned in
+% the "any/affix-punc" file and thus may be included in words.
+ANY-WORD: /^[^[:punct:]]+$/
+
+% For ASCII input and non-PCRE2 regex libraries you can use these instead:
+% ANY-WORD: /^[[:alnum:]]+$/
 % ANY-PUNCT: /^[[:punct:]]+$/
-% MOR-PREF: /^[[:alnum:]']+=$/
-% MOR-STEM: /^[[:alnum:]']+.=$/
-% MOR-SUFF: /^=[[:alnum:]']+$/
+% MOR-PREF: /^[[:alnum:]]+=$/
+% MOR-STEM: /^[[:alnum:]]+.=$/
+% MOR-SUFF: /^=[[:alnum:]]+$/
 
 % Match anything that doesn't match the above.
 % Match anything that isn't white-space.

From 36901d055ddd978cab73e1292118754c74cf9c2f Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Sat, 9 Jul 2022 02:04:01 +0300
Subject: [PATCH 02/23] anysplit(): Move the sanity checks to the start

---
 link-grammar/tokenize/anysplit.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 02593939bd..6f6d2d750c 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -433,9 +433,12 @@ bool anysplit_init(Dictionary afdict)
 #define D_AS 5
 bool anysplit(Sentence sent, Gword *unsplit_word)
 {
-	const char * word = unsplit_word->subword;
 	Dictionary afdict = sent->dict->affix_table;
-	anysplit_params *as;
+	if (NULL == afdict) return false;
+	anysplit_params * as = afdict->anysplit;
+	if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */
+
+	const char * word = unsplit_word->subword;
 	Afdict_class * stemsubscr;
 
 	size_t l = strlen(word);
@@ -452,11 +455,6 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 	char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. affix length */
 	bool use_sampling = true;
 
-	if (NULL == afdict) return false;
-	as = afdict->anysplit;
-
-	if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */
-
 	if (lutf > MAX_WORD_TO_SPLIT)
 	{
 		Gword *alt = issue_word_alternative(sent, unsplit_word, "AS>",

From f3c7d8acaec134379cdf71d40f08a1c9e81e5ef0 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Sat, 9 Jul 2022 02:01:34 +0300
Subject: [PATCH 03/23] free_anysplit(): Move it to be near its usage

---
 link-grammar/tokenize/anysplit.c | 41 ++++++++++++++++----------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 6f6d2d750c..da2acbf038 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -187,27 +187,6 @@ static int split_and_cache(int word_length, int nparts, split_cache *scl)
 	return maxindex+1;
 }
 
-void free_anysplit(Dictionary afdict)
-{
-	size_t i;
-	anysplit_params *as = afdict->anysplit;
-
-	if (NULL == as) return;
-
-	for (i = 0; i < ARRAY_SIZE(as->scl); i++)
-	{
-		if (NULL == as->scl[i].sp) continue;
-		free(as->scl[i].sp);
-		free(as->scl[i].p_selected);
-		free(as->scl[i].p_tried);
-	}
-	free_regexs(as->regpre);
-	free_regexs(as->regmid);
-	free_regexs(as->regsuf);
-	free(as);
-	afdict->anysplit = NULL;
-}
-
 /*
  * Returns: Number of splits.
  */
@@ -327,6 +306,26 @@ static Regex_node * regbuild(const char **regstring, int n, int classnum)
 	return regex_root;
 }
 
+void free_anysplit(Dictionary afdict)
+{
+	size_t i;
+	anysplit_params *as = afdict->anysplit;
+
+	if (NULL == as) return;
+
+	for (i = 0; i < ARRAY_SIZE(as->scl); i++)
+	{
+		if (NULL == as->scl[i].sp) continue;
+		free(as->scl[i].sp);
+		free(as->scl[i].p_selected);
+		free(as->scl[i].p_tried);
+	}
+	free_regexs(as->regpre);
+	free_regexs(as->regmid);
+	free_regexs(as->regsuf);
+	free(as);
+	afdict->anysplit = NULL;
+}
 
 /**
  * Affix classes:

From 8df81c591d2c0c32cfbdc3954c8e89080a93d8e9 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Sat, 9 Jul 2022 02:28:37 +0300
Subject: [PATCH 04/23] morpheme_match(): Rename prefix_string to word_part

---
 link-grammar/tokenize/anysplit.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index da2acbf038..e754378966 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -245,13 +245,13 @@ static bool morpheme_match(Sentence sent,
 	int p;
 	Regex_node *re;
 	size_t blen = strlen(word);
-	char *prefix_string = alloca(blen+1);
+	char *word_part = alloca(blen+1);
 
 	lgdebug(+D_MM, "word=%s: ", word);
 	for (p = 0; p < as->nparts; p++)
 	{
-		size_t b = utf8_strncpy(prefix_string, &word[bos], pl[p]-cpos);
-		prefix_string[b] = '\0';
+		size_t b = utf8_strncpy(word_part, &word[bos], pl[p]-cpos);
+		word_part[b] = '\0';
 		bos += b;
 
 		/* For flexibility, REGRPE is matched only to the prefix part,
@@ -260,10 +260,10 @@ static bool morpheme_match(Sentence sent,
 		if (0 == p) re = as->regpre;
 		else if (pl[p] == (int) lutf) re = as->regsuf;
 		else re = as->regmid;
-		lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, prefix_string);
+		lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, word_part);
 
 		/* A NULL regex always matches */
-		if ((NULL != re) && (NULL == match_regex(re, prefix_string)))
+		if ((NULL != re) && (NULL == match_regex(re, word_part)))
 		{
 			lgdebug(D_MM, "No match\n");
 			return false;

From b22706c736bce2af4300db63ad17d9298187cbaa Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Sat, 9 Jul 2022 02:30:41 +0300
Subject: [PATCH 05/23] anysplit(): Remove commented-out code line

---
 link-grammar/tokenize/anysplit.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index e754378966..602fe8f27c 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -570,7 +570,6 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 
 			bos += b;
 			cpos = pl[p];
-			// if (cpos == lutf) break; /* Same thing as below...*/
 			if (bos == l) break;
 		}
 

From 97fd0c790fc95271c335821e4e45fc6a53de076d Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Sun, 10 Jul 2022 00:33:07 +0300
Subject: [PATCH 06/23] anysplit.c: Fix comment rot

---
 link-grammar/tokenize/anysplit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 602fe8f27c..c153330593 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -43,7 +43,9 @@
 
 #define MAX_WORD_TO_SPLIT 63 /* in codepoints */
 
-typedef int p_start;     /* partition start in a word */
+extern const char * const afdict_classname[];
+
+typedef int p_start;     /* partition end in a word (end char position + 1) */
 typedef p_start *p_list; /* list of partitions in a word */
 
 typedef struct split_cache /* split cached by word length */

From 73eaff66885748f1fbd5669c3e5b39062a068c69 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 11 Jul 2022 16:42:55 +0300
Subject: [PATCH 07/23] anysplit.c: Rename p_start to p_end

---
 link-grammar/tokenize/anysplit.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index c153330593..2740054358 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -45,8 +45,8 @@
 
 extern const char * const afdict_classname[];
 
-typedef int p_start;     /* partition end in a word (end char position + 1) */
-typedef p_start *p_list; /* list of partitions in a word */
+typedef int p_end;     /* partition end in a word (end char position + 1) */
+typedef p_end *p_list; /* list of partitions in a word */
 
 typedef struct split_cache /* split cached by word length */
 {
@@ -102,7 +102,7 @@ static void printps(int *ps, int n)
 
 static void cache_partitions(p_list pl, int *ps, int p)
 {
-	memcpy(pl, ps, sizeof(p_start) * p);
+	memcpy(pl, ps, sizeof(p_end) * p);
 }
 
 	/* p = 5      */
@@ -126,7 +126,7 @@ static int split_and_cache(int word_length, int nparts, split_cache *scl)
 
 	int n;
 	int maxindex;
-	p_list ps = alloca(sizeof(p_start)*nparts); /* partition start */
+	p_list ps = alloca(sizeof(p_end)*nparts);
 
 	if (0 == word_length) return 0;
 
@@ -206,7 +206,7 @@ static int split(int word_length, int nparts, split_cache *scl)
 				word_length, nparts);
 			return 0;
 		}
-		scl->sp = malloc(sizeof(p_start)*nparts * nsplits);
+		scl->sp = malloc(sizeof(p_end)*nparts * nsplits);
 		scl->p_selected = malloc(sizeof(*(scl->p_selected)) * nsplits);
 		scl->p_tried = malloc(sizeof(*(scl->p_tried)) * nsplits);
 		split_and_cache(word_length, nparts, scl);
@@ -260,7 +260,7 @@ static bool morpheme_match(Sentence sent,
 		 * REGMID only to the middle suffixes, and REGSUF only to the
 		 * suffix part - which cannot be the prefix. */
 		if (0 == p) re = as->regpre;
-		else if (pl[p] == (int) lutf) re = as->regsuf;
+		else if (pl[p] == (p_end)lutf) re = as->regsuf;
 		else re = as->regmid;
 		lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, word_part);
 

From af8e4816f496819f3fdb3e2044e84d54ed496f74 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Sun, 10 Jul 2022 01:28:27 +0300
Subject: [PATCH 08/23] morpheme_match(): Update description

---
 link-grammar/tokenize/anysplit.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 2740054358..81a01004cd 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -234,8 +234,9 @@ static int rng_uniform(unsigned int *seedp, size_t nsplits)
 
 }
 
-/* lutf is the length of the string, measured in code-points,
- * blen is the length of the string, measured in bytes.
+/**
+ * Match the \p word parts \p pl to REGPRE, REGMID, and REGSUF.
+ * All the parts must match.
  */
 #define D_MM 7
 static bool morpheme_match(Sentence sent,

From 1576152038d5a4f18587b63831eeb76d030a5eda Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 11 Jul 2022 01:50:10 +0300
Subject: [PATCH 09/23] anysplit.c: Define D_ANYS as the verbosity level for
 this file

---
 link-grammar/tokenize/anysplit.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 81a01004cd..ce49232977 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -9,6 +9,8 @@
 /*                                                                       */
 /*************************************************************************/
 
+#define D_ANYS 5               /* Debug level for this file (6 for more) */
+
 /**
  * anysplit.c -- code that splits words into random morphemes.
  * This is used for the language-learning/morpheme-learning project.
@@ -432,7 +434,6 @@ bool anysplit_init(Dictionary afdict)
  * - an error occurs (the behavior then is undefined).
  *   Such an error has not been observed yet.
  */
-#define D_AS 5
 bool anysplit(Sentence sent, Gword *unsplit_word)
 {
 	Dictionary afdict = sent->dict->affix_table;
@@ -493,7 +494,7 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 		use_sampling = false;
 	}
 
-	lgdebug(+D_AS, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, "
+	lgdebug(+D_ANYS, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, "
 	        "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no",
 	        word, nsplits, as->nparts, as->altsmin, as->altsmax);
 
@@ -514,13 +515,13 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 			sample_point++;
 		}
 
-		lgdebug(D_AS, "Sample: %d ", sample_point);
 		if (as->scl[lutf].p_tried[sample_point])
+		lgdebug(D_ANYS, "Sample: %d ", sample_point);
 		{
-			lgdebug(D_AS+1, "(repeated)\n");
+			lgdebug(D_ANYS+1, "(repeated)\n");
 			continue;
 		}
-		lgdebug(D_AS+1, "(new)");
+		lgdebug(D_ANYS+1, "(new)");
 		rndtried++;
 		as->scl[lutf].p_tried[sample_point] = true;
 		/* The regexes in the affix file can be used to reject partitioning
@@ -532,11 +533,11 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 		}
 		else
 		{
-			lgdebug(D_AS, "\n");
+			lgdebug(D_ANYS, "\n");
 		}
 	}
 
-	lgdebug(D_AS, "Results: word '%s' (utf-char=%zu utf-byte-length=%zu): %zu/%zu:\n",
+	lgdebug(D_ANYS, "Results: word '%s' (utf-char=%zu utf-byte-length=%zu): %zu/%zu:\n",
 	        word, lutf, l, rndissued, nsplits);
 
 	for (i = 0; i < nsplits; i++)
@@ -622,4 +623,3 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 	if (0 != sent->rand_state) sent->rand_state = seed;
 	return true;
 }
-#undef D_AS

From 128fccdc216a04dc19aad314846ca7f10cccbfe0 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 11 Jul 2022 22:12:29 +0300
Subject: [PATCH 10/23] anysplit(): Move 0 length check to the start

---
 link-grammar/tokenize/anysplit.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index ce49232977..05897f4e12 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -444,7 +444,6 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 	const char * word = unsplit_word->subword;
 	Afdict_class * stemsubscr;
 
-	size_t l = strlen(word);
 	size_t lutf = utf8_strlen(word);
 	p_list pl;
 	size_t bos, cpos; /* byte offset, codepoint offset */
@@ -455,10 +454,18 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 	size_t rndissued = 0;
 	size_t i;
 	unsigned int seed = sent->rand_state;
-	char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. affix length */
 	bool use_sampling = true;
 
+	size_t l = strlen(word);
+	char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. affix length */
+
 	if (lutf > MAX_WORD_TO_SPLIT)
+	if (0 == l)
+	{
+		prt_error("Warning: anysplit(): word length 0\n");
+		return false;
+	}
+
 	{
 		Gword *alt = issue_word_alternative(sent, unsplit_word, "AS>",
 		                       0,NULL, 1,&word, 0,NULL);

From 53dd29f144c070924cfc8a8aeeb067d66bf6c4e6 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 11 Jul 2022 22:14:20 +0300
Subject: [PATCH 11/23] anysplit.c: Include pcre2.h

---
 link-grammar/tokenize/anysplit.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 05897f4e12..a46a87ade7 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -29,6 +29,10 @@
 #include <stdbool.h>
 #include <errno.h>
 #include <time.h>
+#if HAVE_PCRE2_H // use it to split in grapheme boundaries
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
+#endif // HAVE_PCRE2_H
 
 #include "api-structures.h"
 #include "dict-common/dict-affix.h"

From aa3e7f1dffc12582e1f652f851f5fa540f8d9ebe Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 11 Jul 2022 22:16:29 +0300
Subject: [PATCH 12/23] anysplit.c: Add data structure for grapheme separation

---
 link-grammar/tokenize/anysplit.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index a46a87ade7..93bf157008 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -47,7 +47,7 @@
 #include "anysplit.h"
 
 
-#define MAX_WORD_TO_SPLIT 63 /* in codepoints */
+#define MAX_WORD_TO_SPLIT 63 /* in codepoints (or graphemes if HAVE_PCRE2_H) */
 
 extern const char * const afdict_classname[];
 
@@ -62,12 +62,23 @@ typedef struct split_cache /* split cached by word length */
 	bool *p_selected;    /* list of selected splits */
 } split_cache;
 
+#if HAVE_PCRE2_H
+typedef struct {
+	char *pattern;
+	pcre2_code *code;
+	pcre2_match_data* match_data;
+} grapheme_regex;
+#endif
+
 typedef struct anysplit_params
 {
 	int nparts;                /* maximum number of suffixes to split to */
 	size_t altsmin;            /* minimum number of alternatives to generate */
 	size_t altsmax;            /* maximum number of alternatives to generate */
 	Regex_node *regpre, *regmid, *regsuf; /* issue matching combinations  */
+#if HAVE_PCRE2_H
+	grapheme_regex gr;
+#endif // HAVE_PCRE2_H
 	split_cache scl[MAX_WORD_TO_SPLIT+1]; /* split cache according to word length */
 } anysplit_params;
 

From 3193e89ea67e0edd04e9994a291ed18117f489e4 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 11 Jul 2022 22:21:22 +0300
Subject: [PATCH 13/23] anysplit.c: Add functions for grapheme separation

---
 link-grammar/tokenize/anysplit.c | 58 ++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 93bf157008..4929e44dc1 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -326,6 +326,64 @@ static Regex_node * regbuild(const char **regstring, int n, int classnum)
 	return regex_root;
 }
 
+#if HAVE_PCRE2_H
+static bool gr_reg_comp(grapheme_regex *re)
+{
+	PCRE2_SIZE erroffset;
+	int rc;
+
+	re->code = pcre2_compile((PCRE2_SPTR)re->pattern, PCRE2_ZERO_TERMINATED,
+	                         PCRE2_UTF, &rc, &erroffset, NULL);
+	if (re->code != NULL)
+	{
+		re->match_data = pcre2_match_data_create_from_pattern(re->code, NULL);
+		if (re->match_data == NULL)
+		{
+			prt_error("Error: pcre2_match_data_create_from_pattern() failed\n");
+			pcre2_code_free(re->code);
+			return false;
+		}
+		return true;
+	}
+
+	/* We have an error. */
+#define ERRBUFFLEN 120
+	PCRE2_UCHAR errbuf[ERRBUFFLEN];
+	pcre2_get_error_message(rc, errbuf, ERRBUFFLEN);
+	prt_error("Error: Failed to compile grapheme regex \"%s\": %s (code %d) at %d\n",
+	          re->pattern, errbuf, rc, (int)erroffset);
+	return false;
+}
+
+static int gr_reg_match(const char *word, grapheme_regex *re)
+{
+	int rc = pcre2_match(re->code, (PCRE2_SPTR)word,
+	                     PCRE2_ZERO_TERMINATED, /*startoffset*/0,
+	                     PCRE2_NO_UTF_CHECK, re->match_data, NULL);
+	if (rc == PCRE2_ERROR_NOMATCH) return rc;
+	if (rc > 0) return rc;
+	if (rc == 0)
+	{
+		prt_error("Error: pcre2_match(): ovector: Internal error\"\n");
+		return rc;
+	}
+
+	/* We have an error. */
+	PCRE2_UCHAR errbuf[ERRBUFFLEN];
+	pcre2_get_error_message(rc, errbuf, ERRBUFFLEN);
+	prt_error("Error: pcre2_match(): \"%s\": %s (code %d)\n",
+	          re->pattern, errbuf, rc);
+	return rc;
+}
+
+static void gr_pcre2_free(grapheme_regex *re)
+{
+	free(re->pattern);
+	pcre2_match_data_free(re->match_data);
+	pcre2_code_free(re->code);
+}
+#endif // HAVE_PCRE2_H
+
 void free_anysplit(Dictionary afdict)
 {
 	size_t i;

From cf6a49818b25a0eaeb4e88d9b53757fe5d8aed35 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 11 Jul 2022 22:37:50 +0300
Subject: [PATCH 14/23] anysplit.c: Add ability to split on grapheme boundaries

---
 link-grammar/tokenize/anysplit.c | 175 +++++++++++++++++++++----------
 1 file changed, 122 insertions(+), 53 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 4929e44dc1..3a51debafc 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -256,21 +256,21 @@ static int rng_uniform(unsigned int *seedp, size_t nsplits)
  * All the parts must match.
  */
 #define D_MM 7
-static bool morpheme_match(Sentence sent,
-	const char *word, size_t lutf, p_list pl)
+static bool morpheme_match(Sentence sent, const char *word, unsigned int nunits,
+                           unsigned int *word_upos, p_list pl)
 {
 	Dictionary afdict = sent->dict->affix_table;
 	anysplit_params *as = afdict->anysplit;
-	size_t bos = 0, cpos = 0; /* byte offset, code-point offset */
-	int p;
-	Regex_node *re;
-	size_t blen = strlen(word);
-	char *word_part = alloca(blen+1);
+	char *word_part = alloca(strlen(word) + 1);
+	size_t bos = 0, upos = 0; /* byte offsets kept across parts: this part's start, previous part's end */
 
 	lgdebug(+D_MM, "word=%s: ", word);
-	for (p = 0; p < as->nparts; p++)
+	for (int p = 0; p < as->nparts; p++)
 	{
-		size_t b = utf8_strncpy(word_part, &word[bos], pl[p]-cpos);
+		size_t b = word_upos[pl[p] - 1] - upos;
+		Regex_node *re;
+
+		memcpy(word_part, &word[bos], b);
 		word_part[b] = '\0';
 		bos += b;
 
@@ -278,7 +278,7 @@ static bool morpheme_match(Sentence sent,
 		 * REGMID only to the middle suffixes, and REGSUF only to the
 		 * suffix part - which cannot be the prefix. */
 		if (0 == p) re = as->regpre;
-		else if (pl[p] == (p_end)lutf) re = as->regsuf;
+		else if (pl[p] == (p_end)nunits) re = as->regsuf;
 		else re = as->regmid;
 		lgdebug(D_MM, "re=%s part%d=%s: ", re?re->name:"(nil)", p, word_part);
 
@@ -289,8 +289,8 @@ static bool morpheme_match(Sentence sent,
 			return false;
 		}
 
-		cpos = pl[p];
-		if (cpos == lutf) break;
+		if (pl[p] == (int)nunits) break;
+		upos = word_upos[pl[p] - 1];
 	}
 
 	lgdebug(D_MM, "Match\n");
@@ -401,6 +401,11 @@ void free_anysplit(Dictionary afdict)
 	free_regexs(as->regpre);
 	free_regexs(as->regmid);
 	free_regexs(as->regsuf);
+
+#if HAVE_PCRE2_H
+	gr_pcre2_free(&as->gr);
+#endif // HAVE_PCRE2_H
+
 	free(as);
 	afdict->anysplit = NULL;
 }
@@ -495,10 +500,93 @@ bool anysplit_init(Dictionary afdict)
 		return false;
 	}
 
+#if HAVE_PCRE2_H
+	const char upat[] = "\\X";
+	const char bpat[] = "^(?>";
+	const char epat[] = "(.+)?)$";
+
+	// Build an optional match fore a single grapheme.
+	const unsigned int ubuf_strlen = strlen(upat) + /*()?*/3;
+	char *ubuf = alloca(ubuf_strlen + 1);
+	snprintf(ubuf, ubuf_strlen + 1, "(%s)?", upat);
+
+	// Build a pattern to match all the graphemes in a word: "^(>(\\X)?...)$"
+	as->gr.pattern =
+		malloc(sizeof(bpat)-1 + ubuf_strlen * MAX_WORD_TO_SPLIT + sizeof(epat));
+	strcpy(as->gr.pattern, bpat);
+	unsigned int n = strlen(as->gr.pattern);
+	for (i = 0; i < MAX_WORD_TO_SPLIT; i++, n+= ubuf_strlen)
+		strcpy(&as->gr.pattern[n], ubuf);
+	strcpy(&as->gr.pattern[n], epat);
+
+	if (!gr_reg_comp(&as->gr)) return false;
+#endif // HAVE_PCRE2_H
+
 	return true;
 }
 #undef D_AI
 
+/*
+ * Return the number of units (codepoints or graphemes) in \p word.
+ * On error, return 0;
+ */
+static unsigned int strlen_units(anysplit_params *as, const char *word)
+{
+#if !HAVE_PCRE2_H
+	// Number of codepoints.
+	return (unsigned int)utf8_strlen(word);
+#else // HAVE_PCRE2_H
+	// Number of graphemes.
+	int rc = gr_reg_match(word, &as->gr);
+	if (rc <= 1) return 0;
+	return (unsigned int)(rc - 1);
+#endif // !HAVE_PCRE2_H
+}
+
+/**
+ * Set the elements of \p word_pos (containing \p nunits elements) to the
+ * end positions (last char position + 1) of the atomic units in \p word.
+ */
+static void	build_unit_positions(anysplit_params *as, const char *word,
+											unsigned int nunits, unsigned int *word_pos)
+{
+	 dassert(nunits != 0, "At least one atomic unit is expected");
+	 const unsigned int *word_pos_base = word_pos;
+
+#if HAVE_PCRE2_H
+	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(as->gr.match_data);
+
+	/* The first [start,end) is of the whole match (the word in this case). */
+	for (unsigned int i = 1; i < nunits + 1; i++)
+		*word_pos++ = (unsigned int)ovector[2*i + 1];
+
+#else
+	unsigned int pos = 0;
+
+	for (unsigned int i = 0; word[i] != '\0'; i = pos)
+	{
+		pos += utf8_charlen(&word[i]);
+		*word_pos++ = pos;
+	}
+
+#endif // HAVE_PCRE2_H
+
+	if (verbosity_level(D_ANYS+1))
+	{
+		unsigned int bos = 0;
+
+		prt_error("Debug: %u atomic units:\n\\", nunits);
+		for (unsigned int i = 0; i < nunits; i ++)
+		{
+			prt_error("%u) %.*s\n\\", i+1,(int)(word_pos_base[i]-bos), &word[bos]);
+			bos = word_pos_base[i];
+		}
+		prt_error("\n");
+
+	}
+	dassert(word_pos[-1] == strlen(word), "Inconsistent word end");
+}
+
 /**
  * Split randomly.
  * Return true on success.
@@ -517,10 +605,6 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 	const char * word = unsplit_word->subword;
 	Afdict_class * stemsubscr;
 
-	size_t lutf = utf8_strlen(word);
-	p_list pl;
-	size_t bos, cpos; /* byte offset, codepoint offset */
-	int p;
 	int sample_point;
 	size_t nsplits;
 	size_t rndtried = 0;
@@ -528,17 +612,18 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 	size_t i;
 	unsigned int seed = sent->rand_state;
 	bool use_sampling = true;
+	unsigned int nunits = strlen_units(as, word);
 
 	size_t l = strlen(word);
 	char *affix = alloca(l+2+1); /* word + ".=" + NUL: Max. affix length */
 
-	if (lutf > MAX_WORD_TO_SPLIT)
 	if (0 == l)
 	{
 		prt_error("Warning: anysplit(): word length 0\n");
 		return false;
 	}
 
+	if ((nunits > MAX_WORD_TO_SPLIT) || (nunits == 0))
 	{
 		Gword *alt = issue_word_alternative(sent, unsplit_word, "AS>",
 		                       0,NULL, 1,&word, 0,NULL);
@@ -546,11 +631,8 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 		return true;
 	}
 
-	if (0 == l)
-	{
-		prt_error("Warning: anysplit(): word length 0\n");
-		return false;
-	}
+	unsigned int *word_upos = alloca(sizeof(int) * nunits);
+	build_unit_positions(as, word, nunits, word_upos);
 
 	stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR);
 
@@ -560,7 +642,7 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 	gw = word;
 #endif
 
-	nsplits = split(lutf, as->nparts, &as->scl[lutf]);
+	nsplits = split(nunits, as->nparts, &as->scl[nunits]);
 	if (0 == nsplits)
 	{
 		prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n");
@@ -595,20 +677,20 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 			sample_point++;
 		}
 
-		if (as->scl[lutf].p_tried[sample_point])
 		lgdebug(D_ANYS, "Sample: %d ", sample_point);
+		if (as->scl[nunits].p_tried[sample_point])
 		{
 			lgdebug(D_ANYS+1, "(repeated)\n");
 			continue;
 		}
 		lgdebug(D_ANYS+1, "(new)");
 		rndtried++;
-		as->scl[lutf].p_tried[sample_point] = true;
+		as->scl[nunits].p_tried[sample_point] = true;
 		/* The regexes in the affix file can be used to reject partitioning
 		 * that break graphemes. */
-		if (morpheme_match(sent, word, lutf, &as->scl[lutf].sp[sample_point*as->nparts]))
+		if (morpheme_match(sent, word, nunits, word_upos, &as->scl[nunits].sp[sample_point*as->nparts]))
 		{
-			as->scl[lutf].p_selected[sample_point] = true;
+			as->scl[nunits].p_selected[sample_point] = true;
 			rndissued++;
 		}
 		else
@@ -617,44 +699,31 @@ bool anysplit(Sentence sent, Gword *unsplit_word)
 		}
 	}
 
-	lgdebug(D_ANYS, "Results: word '%s' (utf-char=%zu utf-byte-length=%zu): %zu/%zu:\n",
-	        word, lutf, l, rndissued, nsplits);
+	lgdebug(D_ANYS, "Results: word '%s' (units=%u byte-length=%zu): %zu/%zu:\n",
+	        word, nunits, l, rndissued, nsplits);
 
 	for (i = 0; i < nsplits; i++)
 	{
+		size_t bos = 0, upos = 0; /* byte offset, codepoint offset */
 		const char **affixes = NULL;
 		int num_sufixes;
 		int num_affixes = 0;
 
-		if (!as->scl[lutf].p_selected[i]) continue;
+		if (!as->scl[nunits].p_selected[i]) continue;
 
-		pl = &as->scl[lutf].sp[i*as->nparts];
-		bos = 0;
-		cpos = 0;
-		for (p = 0; p < as->nparts; p++)
+		p_list pl = &as->scl[nunits].sp[i*as->nparts];
+		for (int p = 0; p < as->nparts; p++)
 		{
-			size_t b = 0;
-			if (pl[0] == (int)lutf)  /* This is the whole word */
-			{
-				b = utf8_strncpy(affix, &word[bos], pl[p]-cpos);
-				affix[b] = '\0';
-			}
-			else if (0 == cpos)   /* The first, but not the only morpheme */
-			{
-				b = utf8_strncpy(affix, &word[bos], pl[p]-cpos);
-				affix[b] = '\0';
-			}
-			else           /* 2nd and subsequent morphemes */
-			{
-				b = utf8_strncpy(affix, &word[bos], pl[p]-cpos);
-				affix[b] = '\0';
-				num_affixes++;
-			}
-			altappend(sent, &affixes, affix);
+			size_t b = word_upos[pl[p] - 1] - upos;
 
+			memcpy(affix, &word[bos], b);
+			affix[b] = '\0';
 			bos += b;
-			cpos = pl[p];
+			altappend(sent, &affixes, affix);
+
 			if (bos == l) break;
+			upos = word_upos[pl[p] - 1];
+			num_affixes++;
 		}
 
 		const char **prefix_position, **stem_position , **suffix_position;

From c2957548899d9bca015a4f81a8bc21e0a8acc467 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 12 Jul 2022 00:23:33 +0300
Subject: [PATCH 15/23] anysplit.c: Add #define atomic-unit instead of a
 hardcoded value

---
 data/amy/4.0.affix               |  6 ++
 link-grammar/tokenize/anysplit.c | 95 ++++++++++++++++++--------------
 2 files changed, 60 insertions(+), 41 deletions(-)

diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix
index b6cd8b06e5..bd0bd94e84 100644
--- a/data/amy/4.0.affix
+++ b/data/amy/4.0.affix
@@ -17,6 +17,12 @@
 
 % Anysplit parameters
 
+% A PCRE2 regex defining a character sequence that shouldn't get split.
+% The LG library must be configured with PCRE2 in order to use it. If not,
+% or if this definition is missing, a single utf8 codpoint is used  as a
+% byte sequence that should not get split.
+#define atomic_unit "\X";
+
 % Maximum number of word partitions
 % FYI: 3 barely works, 4 and higher mostly do not work.
 % 6: REGPARTS+;
diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 3a51debafc..06f84c484e 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -403,7 +403,8 @@ void free_anysplit(Dictionary afdict)
 	free_regexs(as->regsuf);
 
 #if HAVE_PCRE2_H
-	gr_pcre2_free(&as->gr);
+	if (as->gr.pattern != NULL)
+		gr_pcre2_free(&as->gr);
 #endif // HAVE_PCRE2_H
 
 	free(as);
@@ -501,25 +502,32 @@ bool anysplit_init(Dictionary afdict)
 	}
 
 #if HAVE_PCRE2_H
-	const char upat[] = "\\X";
-	const char bpat[] = "^(?>";
-	const char epat[] = "(.+)?)$";
-
-	// Build an optional match fore a single grapheme.
-	const unsigned int ubuf_strlen = strlen(upat) + /*()?*/3;
-	char *ubuf = alloca(ubuf_strlen + 1);
-	snprintf(ubuf, ubuf_strlen + 1, "(%s)?", upat);
-
-	// Build a pattern to match all the graphemes in a word: "^(>(\\X)?...)$"
-	as->gr.pattern =
-		malloc(sizeof(bpat)-1 + ubuf_strlen * MAX_WORD_TO_SPLIT + sizeof(epat));
-	strcpy(as->gr.pattern, bpat);
-	unsigned int n = strlen(as->gr.pattern);
-	for (i = 0; i < MAX_WORD_TO_SPLIT; i++, n+= ubuf_strlen)
-		strcpy(&as->gr.pattern[n], ubuf);
-	strcpy(&as->gr.pattern[n], epat);
-
-	if (!gr_reg_comp(&as->gr)) return false;
+	const char *upat = linkgrammar_get_dict_define(afdict, "atomic-unit");
+	if (upat == NULL)
+	{
+		as->gr.pattern = NULL;
+	}
+	else
+	{
+		const char bpat[] = "^(?>";
+		const char epat[] = "(.+)?)$";
+
+		// Build an optional match for a single grapheme.
+		const unsigned int ubuf_strlen = strlen(upat) + /*()?*/3;
+		char *ubuf = alloca(ubuf_strlen + 1);
+		snprintf(ubuf, ubuf_strlen + 1, "(%s)?", upat);
+
+		// Build a pattern to match all the atomic units in a word: "^(?>(upat)?...(.+)?)$"
+		as->gr.pattern =
+			malloc(sizeof(bpat)-1 + ubuf_strlen * MAX_WORD_TO_SPLIT + sizeof(epat));
+		strcpy(as->gr.pattern, bpat);
+		unsigned int n = strlen(as->gr.pattern);
+		for (i = 0; i < MAX_WORD_TO_SPLIT; i++, n+= ubuf_strlen)
+			strcpy(&as->gr.pattern[n], ubuf);
+		strcpy(&as->gr.pattern[n], epat);
+
+		if (!gr_reg_comp(&as->gr)) return false;
+	}
 #endif // HAVE_PCRE2_H
 
 	return true;
@@ -528,19 +536,21 @@ bool anysplit_init(Dictionary afdict)
 
 /*
  * Return the number of units (codepoints or graphemes) in \p word.
- * On error, return 0;
+ * Since \p word shouldn't be a null string, returned 0 means an error.
  */
 static unsigned int strlen_units(anysplit_params *as, const char *word)
 {
-#if !HAVE_PCRE2_H
+#if HAVE_PCRE2_H
+	if (as->gr.pattern != NULL)
+	{
+		// Number of graphemes.
+		int rc = gr_reg_match(word, &as->gr);
+		if (rc <= 1) return 0;
+		return (unsigned int)(rc - 1);
+	}
+#endif // HAVE_PCRE2_H
 	// Number of codepoints.
 	return (unsigned int)utf8_strlen(word);
-#else // HAVE_PCRE2_H
-	// Number of graphemes.
-	int rc = gr_reg_match(word, &as->gr);
-	if (rc <= 1) return 0;
-	return (unsigned int)(rc - 1);
-#endif // !HAVE_PCRE2_H
 }
 
 /**
@@ -554,22 +564,25 @@ static void	build_unit_positions(anysplit_params *as, const char *word,
 	 const unsigned int *word_pos_base = word_pos;
 
 #if HAVE_PCRE2_H
-	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(as->gr.match_data);
-
-	/* The first [start,end) is of the whole match (the word in this case). */
-	for (unsigned int i = 1; i < nunits + 1; i++)
-		*word_pos++ = (unsigned int)ovector[2*i + 1];
-
-#else
-	unsigned int pos = 0;
-
-	for (unsigned int i = 0; word[i] != '\0'; i = pos)
+	if (as->gr.pattern != NULL)
 	{
-		pos += utf8_charlen(&word[i]);
-		*word_pos++ = pos;
-	}
+		PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(as->gr.match_data);
 
+		/* The first [start,end) is of the whole match (the word in this case). */
+		for (unsigned int i = 1; i < nunits + 1; i++)
+			*word_pos++ = (unsigned int)ovector[2*i + 1];
+	}
+	else
 #endif // HAVE_PCRE2_H
+	{
+		unsigned int pos = 0;
+
+		for (unsigned int i = 0; word[i] != '\0'; i = pos)
+		{
+			pos += utf8_charlen(&word[i]);
+			*word_pos++ = pos;
+		}
+	}
 
 	if (verbosity_level(D_ANYS+1))
 	{

From 74a14684a614646f47c78baa969646286147dc87 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 12 Jul 2022 00:46:04 +0300
Subject: [PATCH 16/23] amy/4.0.affix: Remove regexes for REG*

No need for them after the grapheme-aware separation modification.
---
 data/amy/4.0.affix | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix
index bd0bd94e84..4106fde1c1 100644
--- a/data/amy/4.0.affix
+++ b/data/amy/4.0.affix
@@ -53,16 +53,13 @@
 % For ASCII input, the empty regexes can be used.
 % See the comments in 4.0.affix.
 
-%"" : REGPRE+;
-"^\X+$" : REGPRE+;
+"" : REGPRE+;
 
 % Regex to match the middle parts.
-%"" : REGMID+;
-"^\X+$" : REGMID+;
+"" : REGMID+;
 %".{2,}": REGMID+;
 
 % Regex to match the suffix.
-%"" : REGSUF+;
-"^\X+$" : REGSUF+;
+"" : REGSUF+;
 
 % End of Anysplit parameters.

From 54423ab0cf5114d4e5173c9fe47855f2bd5591ee Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 12 Jul 2022 00:56:01 +0300
Subject: [PATCH 17/23] amy/4.0.affix: Include trailing mark codepoints in
 atomic-unit

This way morpheme candidates (split parts) do not start with marks.
This looks nicer and gives fewer splits. I don't know whether it is more useful.
---
 data/amy/4.0.affix | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/data/amy/4.0.affix b/data/amy/4.0.affix
index 4106fde1c1..15a8647e90 100644
--- a/data/amy/4.0.affix
+++ b/data/amy/4.0.affix
@@ -21,7 +21,8 @@
 % The LG library must be configured with PCRE2 in order to use it. If not,
 % or if this definition is missing, a single utf8 codpoint is used  as a
 % byte sequence that should not get split.
-#define atomic_unit "\X";
+%#define atomic-unit "\X";       % split at grapheme boundaries.
+#define atomic-unit "\X\pM*";   % ... but include trailing mark codepoints.
 
 % Maximum number of word partitions
 % FYI: 3 barely works, 4 and higher mostly do not work.

From e236030a0498e2e4ec55ff6a77b3470f8019ee3d Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 12 Jul 2022 02:09:21 +0300
Subject: [PATCH 18/23] anysplit.c: Change p_end to unsigned int

---
 link-grammar/tokenize/anysplit.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 06f84c484e..735fd1ca70 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -51,10 +51,10 @@
 
 extern const char * const afdict_classname[];
 
-typedef int p_end;     /* partition end in a word (end char position + 1) */
-typedef p_end *p_list; /* list of partitions in a word */
+typedef unsigned int p_end; /* partition end in a word (end char position +1) */
+typedef p_end *p_list;      /* list of partitions in a word */
 
-typedef struct split_cache /* split cached by word length */
+typedef struct split_cache  /* split cached by word length */
 {
 	size_t nsplits;      /* number of splits */
 	p_list sp;           /* list of splits */
@@ -117,7 +117,7 @@ static void printps(int *ps, int n)
 }
 #endif
 
-static void cache_partitions(p_list pl, int *ps, int p)
+static void cache_partitions(p_list pl, unsigned int *ps, int p)
 {
 	memcpy(pl, ps, sizeof(p_end) * p);
 }
@@ -289,7 +289,7 @@ static bool morpheme_match(Sentence sent, const char *word, unsigned int nunits,
 			return false;
 		}
 
-		if (pl[p] == (int)nunits) break;
+		if (pl[p] == nunits) break;
 		upos = word_upos[pl[p] - 1];
 	}
 

From 82a97b0c995cfe96666bbf568a2a94e3362771bf Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Wed, 10 Aug 2022 10:33:28 +0300
Subject: [PATCH 19/23] any/affix_punc: Replace one-char affixes by [[:punct:]]

---
 data/any/affix-punc | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/data/any/affix-punc b/data/any/affix-punc
index 65cddd9f55..19530ae416 100644
--- a/data/any/affix-punc
+++ b/data/any/affix-punc
@@ -1,13 +1,5 @@
-")" "}" "]" ">" » 〉 ） 〕 》 】 ］ 』」 """ "’’" "’" ''.y '.y
-"%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ？！ ….y ....y "”"
-_ - ‐ ‑ ‒ – — ― ～ ━ ー 、
-¢ ₵ ™ ℠ : RPUNC+;
+"’’" ''.y ….y ....y "/[[:punct:]]$/.\0": RPUNC+;
 
-"(" "{" "[" "<" « 〈 （ 〔 《 【 ［ 『 「 """  `` „ “ ‘ ''.x '.x ….x ....x
-¿ ¡ "$"
-_ - ‐ ‑ ‒ – — ― ━ ー ～
-£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺  ℳ  ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점
-† †† ‡ § ¶ © ® ℗ № "#": LPUNC+;
+`` ''.x ….x ....x †† "/^[[:punct:]]/.\0": LPUNC+;
 
--- ‒ – — ― - _ "(" ")" "[" "]" ... … "," ";" ":"
-': MPUNC+;
+-- ... … "/[[:punct:]]/.\0" ': MPUNC+;

From ff92f6a392b6e92439ff4aeac8e0fc20d4fd8dc0 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Wed, 10 Aug 2022 10:35:42 +0300
Subject: [PATCH 20/23] any/affix_punc: Add comments

---
 data/any/affix-punc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/data/any/affix-punc b/data/any/affix-punc
index 19530ae416..849ff287e8 100644
--- a/data/any/affix-punc
+++ b/data/any/affix-punc
@@ -1,3 +1,11 @@
+% Affixes get stripped off the left and right side of words
+% i.e. spaces are inserted between the affix and the word itself.
+
+% An LPUNC/RPUNC/MPUNC token can be specified as "/regex/.\N", where \N is
+% the capture group that should match the affix (the whole pattern is
+% capture group 0). Regardless of the position in which they appear, such
+% tokens are checked last - but in the same relative order. (Experimental.)
+
 "’’" ''.y ….y ....y "/[[:punct:]]$/.\0": RPUNC+;
 
 `` ''.x ….x ....x †† "/^[[:punct:]]/.\0": LPUNC+;

From 8b9f32eb8b22bfd646e5933fc6a0303c0427a333 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Wed, 10 Aug 2022 10:43:17 +0300
Subject: [PATCH 21/23] amy/4.0.regex: Accept subscripted punctuation

---
 data/amy/4.0.regex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/amy/4.0.regex b/data/amy/4.0.regex
index db822164bc..66b911b9ce 100644
--- a/data/amy/4.0.regex
+++ b/data/amy/4.0.regex
@@ -18,7 +18,7 @@
 % along with many other punctuation characters that get strip from start
 % and end of words. See the "any/affix-punc" file. These punctuation
 % characters will match here.
-ANY-PUNCT: /^[[:punct:]]+$/
+ANY-PUNCT: /^[[:punct:]]+(:?\x03|$)/
 
 % Multi-part random morphology: match any string as prefix, stem, or suffix.
 % \x03 matches the internal representation of the dot in STEMSUBSCR

From 9fb80f0a1d4e1c87a3fcf99274e954e053bcb198 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Wed, 10 Aug 2022 10:55:39 +0300
Subject: [PATCH 22/23] amy/4.0.regex: Update the comments

---
 data/amy/4.0.regex | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/data/amy/4.0.regex b/data/amy/4.0.regex
index 66b911b9ce..83367ab5ec 100644
--- a/data/amy/4.0.regex
+++ b/data/amy/4.0.regex
@@ -8,21 +8,19 @@
 % The regexes here use the PCRE2 pattern syntax.
 % The LG library must be configured with PCRE2 in order to use them.
 
-% \X matches any Unicode grapheme.
-% Since most of the script-specific punctuation characters are not in
-% the affix-punc file, they are allowed here to join to the end word/parts
+% \X matches any Unicode grapheme. \x03  matches the internal representation
+% of the dot in STEMSUBSCR (See 4.0.affix).
 %
 % For information on graphemes see: http://www.unicode.org/reports/tr29/
 
-% Hyphenated words, contractions, and words with underbars in them, get split,
-% along with many other punctuation characters that get strip from start
-% and end of words. See the "any/affix-punc" file. These punctuation
-% characters will match here.
+% Punctuation characters are stripped from the start and end of words,
+% and words that contain punctuation are split at them.  See the
+% "any/affix-punc" file.
+% These punctuation characters will match here. The \x03 is to match
+% subscripted punctuation that may be specified in this file.
 ANY-PUNCT: /^[[:punct:]]+(:?\x03|$)/
 
 % Multi-part random morphology: match any string as prefix, stem, or suffix.
-% \x03 matches the internal representation of the dot in STEMSUBSCR
-% (See 4.0.affix).
 
 MOR-STEM: /^\X+\x03=$/
 MOR-PREF: /^\X+=$/
@@ -36,7 +34,7 @@ ANY-WORD: /^[^[:punct:]]+$/
 
 % For ASCII input and non-PCRE2 regex libraries you can use these instead:
 % ANY-WORD: /^[[:alnum:]]+$/
-% ANY-PUNCT: /^[[:punct:]]+$/
+% ANY-PUNCT: /^[[:punct:]].*$/  % The .* is to match an optional subscript.
 % MOR-PREF: /^[[:alnum:]]+=$/
 % MOR-STEM: /^[[:alnum:]]+.=$/
 % MOR-SUFF: /^=[[:alnum:]]+$/

From 56247cb71840cdb6338a9ee3abd2dc76575ae760 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Fri, 12 Aug 2022 17:31:35 +0300
Subject: [PATCH 23/23] afdict_init(): Validate affixes
 w/dictionary_word_is_known()

...instead of dict_has_word(), to allow punctuation that match a regex.
---
 link-grammar/dict-common/dict-impl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c
index 8f97034331..1e3d83448b 100644
--- a/link-grammar/dict-common/dict-impl.c
+++ b/link-grammar/dict-common/dict-impl.c
@@ -805,7 +805,7 @@ bool afdict_init(Dictionary dict)
 
 				for (int n = 0;  n < ac->length - ac->Nregexes; n++)
 				{
-					if (!dict_has_word(dict, ac->string[n]))
+					if (!dictionary_word_is_known(dict, ac->string[n]))
 					{
 						if (!not_in_dict)
 						{