diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt
index f3b16ded6790..12840aec7f39 100644
--- a/icu4c/source/data/brkitr/rules/char.txt
+++ b/icu4c/source/data/brkitr/rules/char.txt
@@ -24,13 +24,9 @@ $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
$Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
-#
-# From cldr/common/properties/segments/
-# and issue CLDR-10994
-#
-$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}];
-$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}];
-$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}];
+$InCBConsonant = [\p{InCB=Consonant}];
+$InCBExtend = [\p{InCB=Extend}];
+$InCBLinker = [\p{InCB=Linker}];
# Korean Syllable Definitions
#
@@ -64,8 +60,8 @@ $L ($L | $V | $LV | $LVT);
# GB 9b
$Prepend [^$Control $CR $LF];
-# GB 9.3, from CLDR-10994
-$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant;
+# GB 9c
+$InCBConsonant [ $InCBExtend $InCBLinker ]* $InCBLinker [ $InCBExtend $InCBLinker ]* $InCBConsonant;
# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
$Extended_Pict $Extend* $ZWJ $Extended_Pict;
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 31897a19ba04..6211af765abe 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -1655,9 +1655,9 @@ class RBBICharMonkey: public RBBIMonkeyKind {
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
UnicodeSet *fExtendedPictSet;
- UnicodeSet *fViramaSet;
- UnicodeSet *fLinkingConsonantSet;
- UnicodeSet *fExtCccZwjSet;
+ UnicodeSet *fInCBLinkerSet;
+ UnicodeSet *fInCBConsonantSet;
+ UnicodeSet *fInCBExtendSet;
UnicodeSet *fAnySet;
const UnicodeString *fText;
@@ -1690,11 +1690,9 @@ RBBICharMonkey::RBBICharMonkey() {
fHangulSet->addAll(*fLVTSet);
fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
- fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
- "\\p{Indic_Syllabic_Category=Virama}]", status);
- fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
- "\\p{Indic_Syllabic_Category=Consonant}]", status);
- fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
+ fInCBLinkerSet = new UnicodeSet(u"[\\p{InCB=Linker}]", status);
+ fInCBConsonantSet = new UnicodeSet(u"[\\p{InCB=Consonant}]", status);
+ fInCBExtendSet = new UnicodeSet(u"[\\p{InCB=Extend}]", status);
fAnySet = new UnicodeSet(0, 0x10ffff);
// Create sets of characters, and add the names of the above character sets.
@@ -1713,9 +1711,9 @@ RBBICharMonkey::RBBICharMonkey() {
sets.emplace_back(*fHangulSet); classNames.emplace_back("Hangul");
sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ");
sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict");
- sets.emplace_back(*fViramaSet); classNames.emplace_back("Virama");
- sets.emplace_back(*fLinkingConsonantSet); classNames.emplace_back("LinkingConsonant");
- sets.emplace_back(*fExtCccZwjSet); classNames.emplace_back("ExtCcccZwj");
+ sets.emplace_back(*fInCBLinkerSet); classNames.emplace_back("InCB=Linker");
+ sets.emplace_back(*fInCBConsonantSet); classNames.emplace_back("InCB=Consonant");
+ sets.emplace_back(*fInCBExtendSet); classNames.emplace_back("InCB=Extend");
sets.emplace_back(*fAnySet); classNames.emplace_back("Any");
if (U_FAILURE(status)) {
@@ -1838,19 +1836,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
continue;
}
- // Note: Viramas are also included in the ExtCccZwj class.
- if (fLinkingConsonantSet->contains(c2)) {
+ if (fInCBConsonantSet->contains(c2)) {
int pi = p1;
bool sawVirama = false;
- while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
- if (fViramaSet->contains(fText->char32At(pi))) {
+ while (pi > 0 && (fInCBExtendSet->contains(fText->char32At(pi)) ||
+ fInCBLinkerSet->contains(fText->char32At(pi)))) {
+ if (fInCBLinkerSet->contains(fText->char32At(pi))) {
sawVirama = true;
}
pi = fText->moveIndex32(pi, -1);
}
- if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
- setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant");
- continue;
+ if (sawVirama && fInCBConsonantSet->contains(fText->char32At(pi))) {
+ setAppliedRule(
+ p2, R"(GB9c \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant})");
+ continue;
}
}
@@ -1903,9 +1902,9 @@ RBBICharMonkey::~RBBICharMonkey() {
delete fAnySet;
delete fZWJSet;
delete fExtendedPictSet;
- delete fViramaSet;
- delete fLinkingConsonantSet;
- delete fExtCccZwjSet;
+ delete fInCBLinkerSet;
+ delete fInCBConsonantSet;
+ delete fInCBExtendSet;
}
//------------------------------------------------------------------------------------------
diff --git a/icu4c/source/test/testdata/break_rules/grapheme.txt b/icu4c/source/test/testdata/break_rules/grapheme.txt
index d5776f33c206..0a811057a579 100644
--- a/icu4c/source/test/testdata/break_rules/grapheme.txt
+++ b/icu4c/source/test/testdata/break_rules/grapheme.txt
@@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
LF = [\p{Grapheme_Cluster_Break = LF}];
Control = [[\p{Grapheme_Cluster_Break = Control}]];
-Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
+Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
@@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
Extended_Pict = [:ExtPict:];
# Indic Sequences
-Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];
-
-LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];
-
-ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
+InCBLinker = [\p{InCB=Linker}];
+InCBConsonant = [\p{InCB=Consonant}];
+InCBExtend = [\p{InCB=Extend}];
GB3: CR LF;
GB4: (Control | CR | LF) ÷;
@@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;
-GB11: Extended_Pict Extend* ZWJ Extended_Pict;
-GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
-GB9: . (Extend | ZWJ);
+GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
+GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
+GB9: . (Extend_ | ZWJ);
GB9a: . SpacingMark;
GB9b: Prepend .;
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
index 7d77588ef977..328d31231e62 100644
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -169,18 +169,9 @@
#
#•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •
-#
-# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
-# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
-# Sample Chars: LinkingConsonant: \u0915
-# Virama: \u094d [also Extend]
-# ExtCccZWJ: \u0308
-# Extend but not ExtCCCZWJ \u093A
-
-•\u0915\u094d\u0915•
-•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•
-•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•
-•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•
+# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
+# This test would have caught ICU-22956.
+•સૻ્સૻ•
#
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt
diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk
index 16a9aceee89a..fd22a1c22e5b 100644
Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/char.brk differ
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
index cabbfc50f438..d212c7ae67ea 100644
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -145,9 +145,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
UnicodeSet fHangulSet;
UnicodeSet fZWJSet;
UnicodeSet fExtendedPictSet;
- UnicodeSet fViramaSet;
- UnicodeSet fLinkingConsonantSet;
- UnicodeSet fExtCccZwjSet;
+ UnicodeSet fInCBLinkerSet;
+ UnicodeSet fInCBConsonantSet;
+ UnicodeSet fInCBExtendSet;
UnicodeSet fAnySet;
@@ -176,11 +176,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
fHangulSet.addAll(fLVTSet);
fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]");
- fViramaSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
- + "\\p{Indic_Syllabic_Category=Virama}]");
- fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
- + "\\p{Indic_Syllabic_Category=Consonant}]");
- fExtCccZwjSet = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]");
+ fInCBLinkerSet = new UnicodeSet("[\\p{InCB=Linker}]");
+ fInCBConsonantSet = new UnicodeSet("[\\p{InCB=Consonant}]");
+ fInCBExtendSet = new UnicodeSet("[\\p{InCB=Extend}]");
fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
@@ -196,9 +194,9 @@ static class RBBICharMonkey extends RBBIMonkeyKind {
fSets.add(fAnySet); fClassNames.add("Any");
fSets.add(fZWJSet); fClassNames.add("ZWJ");
fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict");
- fSets.add(fViramaSet); fClassNames.add("Virama");
- fSets.add(fLinkingConsonantSet); fClassNames.add("LinkingConsonant");
- fSets.add(fExtCccZwjSet); fClassNames.add("ExtCccZwj");
+ fSets.add(fInCBLinkerSet); fClassNames.add("InCB=Linker");
+ fSets.add(fInCBConsonantSet); fClassNames.add("InCB=Consonant");
+ fSets.add(fInCBExtendSet); fClassNames.add("InCB=Extend");
}
@@ -315,17 +313,17 @@ int next(int prevPos) {
}
- // Note: Viramas are also included in the ExtCccZwj class.
- if (fLinkingConsonantSet.contains(c2)) {
+ if (fInCBConsonantSet.contains(c2)) {
int pi = p1;
boolean sawVirama = false;
- while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) {
- if (fViramaSet.contains(fText.codePointAt(pi))) {
+ while (pi > 0 && (fInCBExtendSet.contains(fText.codePointAt(pi)) ||
+ fInCBLinkerSet.contains(fText.codePointAt(pi)))) {
+ if (fInCBLinkerSet.contains(fText.codePointAt(pi))) {
sawVirama = true;
}
pi = fText.offsetByCodePoints(pi, -1);
}
- if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) {
- setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
+ if (sawVirama && fInCBConsonantSet.contains(fText.codePointAt(pi))) {
+ setAppliedRule(p2, "GB9c \\p{InCB=Consonant} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* \\p{InCB=Linker} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* × \\p{InCB=Consonant}");
continue;
}
}
diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt
index d5776f33c206..0a811057a579 100644
--- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt
+++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/grapheme.txt
@@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
LF = [\p{Grapheme_Cluster_Break = LF}];
Control = [[\p{Grapheme_Cluster_Break = Control}]];
-Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
+Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
@@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
Extended_Pict = [:ExtPict:];
# Indic Sequences
-Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];
-
-LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];
-
-ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
+InCBLinker = [\p{InCB=Linker}];
+InCBConsonant = [\p{InCB=Consonant}];
+InCBExtend = [\p{InCB=Extend}];
GB3: CR LF;
GB4: (Control | CR | LF) ÷;
@@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;
-GB11: Extended_Pict Extend* ZWJ Extended_Pict;
-GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
-GB9: . (Extend | ZWJ);
+GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
+GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
+GB9: . (Extend_ | ZWJ);
GB9a: . SpacingMark;
GB9b: Prepend .;
diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt
index 7d77588ef977..328d31231e62 100644
--- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -169,18 +169,9 @@
#
#•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •
-#
-# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
-# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
-# Sample Chars: LinkingConsonant: \u0915
-# Virama: \u094d [also Extend]
-# ExtCccZWJ: \u0308
-# Extend but not ExtCCCZWJ \u093A
-
-•\u0915\u094d\u0915•
-•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•
-•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•
-•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•
+# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
+# This test would have caught ICU-22956.
+•સૻ્સૻ•
#
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt