Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22956 Use InCB for grapheme cluster segmentation #3257

Merged
merged 1 commit into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 5 additions & 9 deletions icu4c/source/data/brkitr/rules/char.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,9 @@ $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
$Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
# From cldr/common/properties/segments/
# and issue CLDR-10994
#
$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}];
$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}];
$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}];
$InCBConsonant = [\p{InCB=Consonant}];
$InCBExtend = [\p{InCB=Extend}];
$InCBLinker = [\p{InCB=Linker}];

# Korean Syllable Definitions
#
Expand Down Expand Up @@ -64,8 +60,8 @@ $L ($L | $V | $LV | $LVT);
# GB 9b
$Prepend [^$Control $CR $LF];

# GB 9.3, from CLDR-10994
$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant;
# GB 9c
$InCBConsonant [ $InCBExtend $InCBLinker ]* $InCBLinker [ $InCBExtend $InCBLinker ]* $InCBConsonant;

# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
$Extended_Pict $Extend* $ZWJ $Extended_Pict;
Expand Down
41 changes: 20 additions & 21 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1655,9 +1655,9 @@ class RBBICharMonkey: public RBBIMonkeyKind {
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
UnicodeSet *fExtendedPictSet;
UnicodeSet *fViramaSet;
UnicodeSet *fLinkingConsonantSet;
UnicodeSet *fExtCccZwjSet;
UnicodeSet *fInCBLinkerSet;
UnicodeSet *fInCBConsonantSet;
UnicodeSet *fInCBExtendSet;
UnicodeSet *fAnySet;

const UnicodeString *fText;
Expand Down Expand Up @@ -1690,11 +1690,9 @@ RBBICharMonkey::RBBICharMonkey() {
fHangulSet->addAll(*fLVTSet);

fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
"\\p{Indic_Syllabic_Category=Virama}]", status);
fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
"\\p{Indic_Syllabic_Category=Consonant}]", status);
fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
fInCBLinkerSet = new UnicodeSet(u"[\\p{InCB=Linker}]", status);
fInCBConsonantSet = new UnicodeSet(u"[\\p{InCB=Consonant}]", status);
fInCBExtendSet = new UnicodeSet(u"[\\p{InCB=Extend}]", status);
fAnySet = new UnicodeSet(0, 0x10ffff);

// Create sets of characters, and add the names of the above character sets.
Expand All @@ -1713,9 +1711,9 @@ RBBICharMonkey::RBBICharMonkey() {
sets.emplace_back(*fHangulSet); classNames.emplace_back("Hangul");
sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ");
sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict");
sets.emplace_back(*fViramaSet); classNames.emplace_back("Virama");
sets.emplace_back(*fLinkingConsonantSet); classNames.emplace_back("LinkingConsonant");
sets.emplace_back(*fExtCccZwjSet); classNames.emplace_back("ExtCcccZwj");
sets.emplace_back(*fInCBLinkerSet); classNames.emplace_back("InCB=Linker");
sets.emplace_back(*fInCBConsonantSet); classNames.emplace_back("InCB=Consonant");
sets.emplace_back(*fInCBExtendSet); classNames.emplace_back("InCB=Extend");
sets.emplace_back(*fAnySet); classNames.emplace_back("Any");

if (U_FAILURE(status)) {
Expand Down Expand Up @@ -1838,19 +1836,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
continue;
}

// Note: Viramas are also included in the ExtCccZwj class.
if (fLinkingConsonantSet->contains(c2)) {
if (fInCBConsonantSet->contains(c2)) {
int pi = p1;
bool sawVirama = false;
while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
if (fViramaSet->contains(fText->char32At(pi))) {
while (pi > 0 && (fInCBExtendSet->contains(fText->char32At(pi)) ||
fInCBLinkerSet->contains(fText->char32At(pi)))) {
if (fInCBLinkerSet->contains(fText->char32At(pi))) {
sawVirama = true;
}
pi = fText->moveIndex32(pi, -1);
}
if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant");
continue;
if (sawVirama && fInCBConsonantSet->contains(fText->char32At(pi))) {
setAppliedRule(
p2, R"(GB9c \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant})");
continue;
}
}

Expand Down Expand Up @@ -1903,9 +1902,9 @@ RBBICharMonkey::~RBBICharMonkey() {
delete fAnySet;
delete fZWJSet;
delete fExtendedPictSet;
delete fViramaSet;
delete fLinkingConsonantSet;
delete fExtCccZwjSet;
delete fInCBLinkerSet;
delete fInCBConsonantSet;
delete fInCBExtendSet;
}

//------------------------------------------------------------------------------------------
Expand Down
16 changes: 7 additions & 9 deletions icu4c/source/test/testdata/break_rules/grapheme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
LF = [\p{Grapheme_Cluster_Break = LF}];

Control = [[\p{Grapheme_Cluster_Break = Control}]];
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
Expand All @@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
Extended_Pict = [:ExtPict:];

# Indic Sequences
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];

LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];

ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
InCBLinker = [\p{InCB=Linker}];
InCBConsonant = [\p{InCB=Consonant}];
InCBExtend = [\p{InCB=Extend}];

GB3: CR LF;
GB4: (Control | CR | LF) ÷;
Expand All @@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;

GB11: Extended_Pict Extend* ZWJ Extended_Pict;
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
GB9: . (Extend | ZWJ);
GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
GB9: . (Extend_ | ZWJ);

GB9a: . SpacingMark;
GB9b: Prepend .;
Expand Down
15 changes: 3 additions & 12 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -169,18 +169,9 @@
#
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>

#
# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
# Sample Chars: LinkingConsonant: \u0915
# Virama: \u094d [also Extend]
# ExtCccZWJ: \u0308
# Extend but not ExtCCCZWJ \u093A
<char>
<data>•\u0915\u094d\u0915•</data>
<data>•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•</data>
<data>•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•</data>
<data>•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•</data>
# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
# This test would have caught ICU-22956.
<data>•સૻ્સૻ•</data>

#
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
UnicodeSet fHangulSet;
UnicodeSet fZWJSet;
UnicodeSet fExtendedPictSet;
UnicodeSet fViramaSet;
UnicodeSet fLinkingConsonantSet;
UnicodeSet fExtCccZwjSet;
UnicodeSet fInCBLinkerSet;
UnicodeSet fInCBConsonantSet;
UnicodeSet fInCBExtendSet;
UnicodeSet fAnySet;


Expand Down Expand Up @@ -176,11 +176,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
fHangulSet.addAll(fLVTSet);

fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]");
fViramaSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
+ "\\p{Indic_Syllabic_Category=Virama}]");
fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
+ "\\p{Indic_Syllabic_Category=Consonant}]");
fExtCccZwjSet = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]");
fInCBLinkerSet = new UnicodeSet("[\\p{InCB=Linker}]");
fInCBConsonantSet = new UnicodeSet("[\\p{InCB=Consonant}]");
fInCBExtendSet = new UnicodeSet("[\\p{InCB=Extend}]");
fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");


Expand All @@ -196,9 +194,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
fSets.add(fAnySet); fClassNames.add("Any");
fSets.add(fZWJSet); fClassNames.add("ZWJ");
fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict");
fSets.add(fViramaSet); fClassNames.add("Virama");
fSets.add(fLinkingConsonantSet); fClassNames.add("LinkingConsonant");
fSets.add(fExtCccZwjSet); fClassNames.add("ExtCccZwj");
fSets.add(fInCBLinkerSet); fClassNames.add("InCB=Linker");
fSets.add(fInCBConsonantSet); fClassNames.add("InCB=Consonant");
fSets.add(fInCBExtendSet); fClassNames.add("InCB=Extend");
}


Expand Down Expand Up @@ -315,17 +313,18 @@ int next(int prevPos) {
}

// Note: InCB=Linker characters are also consumed by the backward scan below, alongside InCB=Extend.
if (fLinkingConsonantSet.contains(c2)) {
if (fInCBConsonantSet.contains(c2)) {
int pi = p1;
boolean sawVirama = false;
while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) {
if (fViramaSet.contains(fText.codePointAt(pi))) {
while (pi > 0 && (fInCBExtendSet.contains(fText.codePointAt(pi)) ||
fInCBLinkerSet.contains(fText.codePointAt(pi)))) {
if (fInCBLinkerSet.contains(fText.codePointAt(pi))) {
sawVirama = true;
}
pi = fText.offsetByCodePoints(pi, -1);
}
if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) {
setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
if (sawVirama && fInCBConsonantSet.contains(fText.codePointAt(pi))) {
setAppliedRule(p2, "GB9c \\p{InCB=Consonant} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* \\p{InCB=Linker} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* × \\p{InCB=Consonant})");
continue;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
LF = [\p{Grapheme_Cluster_Break = LF}];

Control = [[\p{Grapheme_Cluster_Break = Control}]];
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
Expand All @@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
Extended_Pict = [:ExtPict:];

# Indic Sequences
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];

LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];

ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
InCBLinker = [\p{InCB=Linker}];
InCBConsonant = [\p{InCB=Consonant}];
InCBExtend = [\p{InCB=Extend}];

GB3: CR LF;
GB4: (Control | CR | LF) ÷;
Expand All @@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;

GB11: Extended_Pict Extend* ZWJ Extended_Pict;
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
GB9: . (Extend | ZWJ);
GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
GB9: . (Extend_ | ZWJ);

GB9a: . SpacingMark;
GB9b: Prepend .;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,18 +169,9 @@
#
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>

#
# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
# Sample Chars: LinkingConsonant: \u0915
# Virama: \u094d [also Extend]
# ExtCccZWJ: \u0308
# Extend but not ExtCCCZWJ \u093A
<char>
<data>•\u0915\u094d\u0915•</data>
<data>•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•</data>
<data>•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•</data>
<data>•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•</data>
# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
# This test would have caught ICU-22956.
<data>•સૻ્સૻ•</data>

#
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt
Expand Down