Skip to content

Commit 8154757

Browse files
authored
[wasm][globalization] HybridGlobalization fix bug in change case (#86799)
* Fix + test. * Fix surrogates problem, document final sigma. * Update change-case.ts * Fix NLS
1 parent 0a3fb4c commit 8154757

File tree

3 files changed

+152
-12
lines changed

3 files changed

+152
-12
lines changed

docs/design/features/hybrid-globalization.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ Affected public APIs:
2727
- TextInfo.ToTitleCase.
2828

2929
Case change with invariant culture uses `toUpperCase` / `toLowerCase` functions that do not guarantee a full match with the original invariant culture.
30+
Hybrid case change, same as ICU-based, does not support code point expansion, e.g. "straße" -> "STRAßE".
31+
32+
- Final sigma behavior correction:
33+
34+
ICU-based case change does not respect the final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ".
3035

3136
**String comparison**
3237

src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,17 @@ public static IEnumerable<object[]> ToLower_TestData()
272272
// these sorts of expansions, since it would cause string lengths to change when cased,
273273
// which is non-intuitive. In addition, there are some context sensitive mappings which
274274
// we also don't perform.
275-
// Greek Capital Letter Sigma (does not to case to U+03C2 with "final sigma" rule).
275+
// Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule).
276276
yield return new object[] { cultureName, "\u03A3", "\u03C3" };
277+
if (PlatformDetection.IsHybridGlobalizationOnBrowser)
278+
{
279+
// JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
280+
yield return new object[] { cultureName, "O\u03A3", "o\u03C2" };
281+
}
282+
else
283+
{
284+
yield return new object[] { cultureName, "O\u03A3", "o\u03C3" };
285+
}
277286
}
278287

279288
foreach (string cultureName in GetTestLocales())
@@ -393,7 +402,10 @@ public static IEnumerable<object[]> ToUpper_TestData()
393402
// which is non-intuitive. In addition, there are some context sensitive mappings which
394403
// we also don't perform.
395404
// es-zed does not case to SS when uppercased.
396-
yield return new object[] { cultureName, "\u00DF", "\u00DF" };
405+
yield return new object[] { cultureName, "\u00DF", "\u00DF" };
406+
yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
407+
if (!PlatformDetection.IsNlsGlobalization)
408+
yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
397409

398410
// Ligatures do not expand when cased.
399411
yield return new object[] { cultureName, "\uFB00", "\uFB00" };

src/mono/wasm/runtime/hybrid-globalization/change-case.ts

Lines changed: 133 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,75 @@ import { monoStringToString, utf16ToStringLoop, stringToUTF16 } from "../strings
66
import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/internal";
77
import { Int32Ptr } from "../types/emscripten";
88
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
9+
import { localHeapViewU16, setU16_local } from "../memory";
10+
11+
const SURROGATE_HIGHER_START = "\uD800";
12+
const SURROGATE_HIGHER_END = "\uDBFF";
13+
const SURROGATE_LOWER_START = "\uDC00";
14+
const SURROGATE_LOWER_END = "\uDFFF";
915

1016
export function mono_wasm_change_case_invariant(src: number, srcLength: number, dst: number, dstLength: number, toUpper: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): void {
1117
const exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
1218
try {
1319
const input = utf16ToStringLoop(src, src + 2 * srcLength);
14-
let result = toUpper ? input.toUpperCase() : input.toLowerCase();
20+
const result = toUpper ? input.toUpperCase() : input.toLowerCase();
1521
// Unicode defines some codepoints which expand into multiple codepoints,
1622
// originally we do not support this expansion
17-
if (result.length > dstLength)
18-
result = input;
19-
stringToUTF16(dst, dst + 2 * dstLength, result);
20-
wrap_no_error_root(is_exception, exceptionRoot);
23+
if (result.length <= dstLength)
24+
{
25+
stringToUTF16(dst, dst + 2 * dstLength, result);
26+
wrap_no_error_root(is_exception, exceptionRoot);
27+
return;
28+
}
29+
30+
// workaround to maintain the ICU-like behavior
31+
const heapI16 = localHeapViewU16();
32+
let jump = 1;
33+
if (toUpper)
34+
{
35+
for (let i=0; i < input.length; i+=jump)
36+
{
37+
// surrogate parts have to enter ToUpper/ToLower together to give correct output
38+
if (isSurrogate(input, i))
39+
{
40+
jump = 2;
41+
const surrogate = input.substring(i, i+2);
42+
const upperSurrogate = surrogate.toUpperCase();
43+
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
44+
appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);
45+
46+
}
47+
else
48+
{
49+
jump = 1;
50+
const upperChar = input[i].toUpperCase();
51+
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
52+
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
53+
}
54+
}
55+
}
56+
else
57+
{
58+
for (let i=0; i < input.length; i+=jump)
59+
{
60+
if (isSurrogate(input, i))
61+
{
62+
jump = 2;
63+
const surrogate = input.substring(i, i+2);
64+
const upperSurrogate = surrogate.toLowerCase();
65+
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
66+
appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);
67+
68+
}
69+
else
70+
{
71+
jump = 1;
72+
const upperChar = input[i].toLowerCase();
73+
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
74+
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
75+
}
76+
}
77+
}
2178
}
2279
catch (ex: any) {
2380
wrap_error_root(is_exception, ex, exceptionRoot);
@@ -35,11 +92,62 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
3592
if (!cultureName)
3693
throw new Error("Cannot change case, the culture name is null.");
3794
const input = utf16ToStringLoop(src, src + 2 * srcLength);
38-
let result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName);
39-
if (result.length > dstLength)
40-
result = input;
95+
const result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName);
96+
97+
if (result.length <= input.length)
98+
{
99+
stringToUTF16(dst, dst + 2 * dstLength, result);
100+
wrap_no_error_root(is_exception, exceptionRoot);
101+
return;
102+
}
103+
// workaround to maintain the ICU-like behavior
104+
const heapI16 = localHeapViewU16();
105+
let jump = 1;
106+
if (toUpper)
107+
{
108+
for (let i=0; i < input.length; i+=jump)
109+
{
110+
// surrogate parts have to enter ToUpper/ToLower together to give correct output
111+
if (isSurrogate(input, i))
112+
{
113+
jump = 2;
114+
const surrogate = input.substring(i, i+2);
115+
const upperSurrogate = surrogate.toLocaleUpperCase(cultureName);
116+
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
117+
appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);
41118

42-
stringToUTF16(dst, dst + 2 * dstLength, result);
119+
}
120+
else
121+
{
122+
jump = 1;
123+
const upperChar = input[i].toLocaleUpperCase(cultureName);
124+
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
125+
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
126+
}
127+
}
128+
}
129+
else
130+
{
131+
for (let i=0; i < input.length; i+=jump)
132+
{
133+
// surrogate parts have to enter ToUpper/ToLower together to give correct output
134+
if (isSurrogate(input, i))
135+
{
136+
jump = 2;
137+
const surrogate = input.substring(i, i+2);
138+
const upperSurrogate = surrogate.toLocaleLowerCase(cultureName);
139+
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
140+
appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);
141+
}
142+
else
143+
{
144+
jump = 1;
145+
const lowerChar = input[i].toLocaleLowerCase(cultureName);
146+
const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar;
147+
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
148+
}
149+
}
150+
}
43151
wrap_no_error_root(is_exception, exceptionRoot);
44152
}
45153
catch (ex: any) {
@@ -49,4 +157,19 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
49157
cultureRoot.release();
50158
exceptionRoot.release();
51159
}
52-
}
160+
}
161+
162+
function isSurrogate(str: string, startIdx: number) : boolean
163+
{
164+
return SURROGATE_HIGHER_START <= str[startIdx] &&
165+
str[startIdx] <= SURROGATE_HIGHER_END &&
166+
startIdx+1 < str.length &&
167+
SURROGATE_LOWER_START <= str[startIdx+1] &&
168+
str[startIdx+1] <= SURROGATE_LOWER_END;
169+
}
170+
171+
function appendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number)
172+
{
173+
setU16_local(heapI16, dst + idx*2, surrogate.charCodeAt(0));
174+
setU16_local(heapI16, dst + (idx+1)*2, surrogate.charCodeAt(1));
175+
}

0 commit comments

Comments
 (0)