Skip to content

Commit 67a4427

Browse files
committed
ICU-22511 Handling unpaired surrogates and improved test cases.
1 parent deeed0f commit 67a4427

File tree

4 files changed: 48 additions and 16 deletions.

icu4c/source/i18n/utf16collationiterator.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -303,7 +303,7 @@ FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
303303
// To test possible trailing ccc, we need to check high surrogate
304304
// (or previous character for broken surrogate pair).
305305
if (pos - 1 != start) {
306-
normalizePrevSegment = CollationFCD::hasTccc(*(pos - 2));
306+
normalizePrevSegment = U16_IS_LEAD(*(pos - 2)) && CollationFCD::hasTccc(*(pos - 2));
307307
}
308308
} else {
309309
normalizePrevSegment = CollationFCD::hasTccc(*(pos - 1));

icu4c/source/test/intltest/collationtest.cpp

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2128,12 +2128,25 @@ void CollationTest::TestCollatorMap() {
21282128

21292129
void CollationTest::TestColItrInfiniteLoop22511() {
21302130
IcuTestErrorCode errorCode(*this, "TestColItrInfiniteLoop22511");
2131-
char16_t str1[] = {
2132-
0x0100, 0x032a, 0x01e0, 0xd804, 0xdd00, 0x031c
2133-
};
2134-
char16_t str2[] = {
2135-
0x0041, 0x0304, 0x032a, 0x01e0, 0xd804, 0xdd00, 0x031c
2131+
const char16_t* testCases[][2] = {
2132+
{
2133+
u"\u0100\u032a\u01e0\U00011100\u031c",
2134+
u"A\u0304\u032a\u01e0\U00011100\u031c" // Equivalent to above, but U+0100 is decomposed to U+0041 U+0304.
2135+
},
2136+
{
2137+
u"\u0100\u032a\u01e0\xdd00\u031c", // High surrogate 0xd804 is dropped
2138+
u"A\u0304\u032a\u01e0\xdd00\u031c" // Equivalent to above, but U+0100 is decomposed
2139+
},
2140+
{
2141+
u"\u0100\u032a\u01e0\xd804\u031c", // Low surrogate 0xdd00 is dropped
2142+
u"A\u0304\u032a\u01e0\xd804\u031c" // Equivalent to above, but U+0100 is decomposed
2143+
},
2144+
{nullptr, nullptr}
21362145
};
2146+
2147+
StringPiece sp1 = u8"\u0100\u032a\u01e0\U00011100\u031c"; // UTF-8 equivalent of testCases[0][0]
2148+
StringPiece sp2 = u8"A\u0304\u032a\u01e0\U00011100\u031c"; // UTF-8 equivalent of testCases[0][1]
2149+
21372150
int32_t num_locales = 0;
21382151
const icu::Locale* locales = icu::Locale::getAvailableLocales(num_locales);
21392152
for (int32_t i = 0; i < num_locales; i++) {
@@ -2142,12 +2155,16 @@ void CollationTest::TestColItrInfiniteLoop22511() {
21422155
LocalPointer<Collator> coll(Collator::createInstance(l, errorCode));
21432156
errorCode.assertSuccess();
21442157
coll->setStrength(icu::Collator::IDENTICAL);
2145-
UCollationResult result = coll->compare(
2146-
str1, sizeof(str1)/sizeof(char16_t),
2147-
str2, sizeof(str2)/sizeof(char16_t),
2148-
errorCode);
2158+
for (int j = 0; testCases[j][0] != nullptr; j++) {
2159+
UCollationResult result = coll->compare(testCases[j][0], -1, testCases[j][1], -1, errorCode);
2160+
errorCode.assertSuccess();
2161+
assertEquals(UnicodeString("Locale ") + l.getName() + "UTF16 case:" + j, UCOL_EQUAL, result);
2162+
}
2163+
2164+
// Also test the UTF-8 versions
2165+
UCollationResult result = coll->compareUTF8(sp1, sp2, errorCode);
21492166
errorCode.assertSuccess();
2150-
assertEquals(UnicodeString("Locale ") + l.getName(), UCOL_EQUAL, result);
2167+
assertEquals(UnicodeString("Locale ") + l.getName() + " UTF-8", UCOL_EQUAL, result);
21512168
}
21522169
}
21532170

icu4j/main/collate/src/main/java/com/ibm/icu/impl/coll/FCDUTF16CollationIterator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ public int previousCodePoint() {
144144
// To test possible trailing ccc, we need to check high surrogate
145145
// (or previous character for broken surrogate pair).
146146
if (pos - 1 != start) {
147-
normalizePrevSegment = CollationFCD.hasTccc(seq.charAt(pos - 2));
147+
normalizePrevSegment = Character.isHighSurrogate(seq.charAt(pos - 2)) && CollationFCD.hasTccc(seq.charAt(pos - 2));
148148
}
149149
} else {
150150
normalizePrevSegment = CollationFCD.hasTccc(seq.charAt(pos - 1));

icu4j/main/collate/src/test/java/com/ibm/icu/dev/test/collator/CollationTest.java

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1965,15 +1965,30 @@ public void TestBuilderContextsOverflow() {
19651965
public void TestColItrInfiniteLoop22511() {
19661966
// ICU-22511: locales vi and wo trigger an infinite loop when getting
19671967
// collation keys for these strings.
1968-
final String str1 = "\u0100\u032a\u01e0\ud804\udd00\u031c";
1969-
final String str2 = "\u0041\u0304\u032a\u01e0\ud804\udd00\u031c";
1968+
String[][] testCases = {
1969+
{
1970+
"\u0100\u032a\u01e0\ud804\udd00\u031c",
1971+
"\u0041\u0304\u032a\u01e0\ud804\udd00\u031c" // Equivalent to above, but U+0100 is decomposed to U+0041 U+0304
1972+
},
1973+
{
1974+
"\u0100\u032a\u01e0\udd00\u031c", // High surrogate 0xd804 is dropped
1975+
"\u0041\u0304\u032a\u01e0\udd00\u031c" // Equivalent to above, but U+0100 is decomposed
1976+
},
1977+
{
1978+
"\u0100\u032a\u01e0\ud804\u031c", // Low surrogate 0xdd00 is dropped
1979+
"\u0041\u0304\u032a\u01e0\ud804\u031c" // Equivalent to above, but U+0100 is decomposed
1980+
}
1981+
};
19701982

19711983
ULocale[] locales = ULocale.getAvailableLocales();
19721984
for (ULocale loc : locales) {
19731985
Collator coll = Collator.getInstance(loc);
19741986
coll.setStrength(Collator.IDENTICAL);
1975-
int cmp = coll.compare(str1, str2);
1976-
assertEquals("Locale " + loc.toString(), 0, cmp);
1987+
for (int i = 0; i < testCases.length; i++) {
1988+
String[] testCase = testCases[i];
1989+
int cmp = coll.compare(testCase[0], testCase[1]);
1990+
assertEquals("Locale " + loc.toString() + " case:" + i, 0, cmp);
1991+
}
19771992
}
19781993
}
19791994
}

0 commit comments

Comments
 (0)