Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/design/features/hybrid-globalization.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ Affected public APIs:
- TextInfo.ToTitleCase.

Case change with invariant culture uses `toUpperCase` / `toLowerCase` functions that do not guarantee a full match with the original invariant culture.
Hybrid case change, same as ICU-based, does not support code point expansion, e.g. "straße" -> "STRAßE".

- Final sigma behavior correction:

ICU-based case change does not respect the final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ".

**String comparison**

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,17 @@ public static IEnumerable<object[]> ToLower_TestData()
// these sorts of expansions, since it would cause string lengths to change when cased,
// which is non-intuitive. In addition, there are some context sensitive mappings which
// we also don't perform.
// Greek Capital Letter Sigma (does not to case to U+03C2 with "final sigma" rule).
// Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule).
yield return new object[] { cultureName, "\u03A3", "\u03C3" };
if (PlatformDetection.IsHybridGlobalizationOnBrowser)
{
// JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
yield return new object[] { cultureName, "O\u03A3", "o\u03C2" };
}
else
{
yield return new object[] { cultureName, "O\u03A3", "o\u03C3" };
}
}

foreach (string cultureName in GetTestLocales())
Expand Down Expand Up @@ -394,6 +403,8 @@ public static IEnumerable<object[]> ToUpper_TestData()
// we also don't perform.
// es-zed does not case to SS when uppercased.
yield return new object[] { cultureName, "\u00DF", "\u00DF" };
yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };

// Ligatures do not expand when cased.
yield return new object[] { cultureName, "\uFB00", "\uFB00" };
Expand Down
143 changes: 133 additions & 10 deletions src/mono/wasm/runtime/hybrid-globalization/change-case.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,75 @@ import { monoStringToString, utf16ToStringLoop, stringToUTF16 } from "../strings
import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/internal";
import { Int32Ptr } from "../types/emscripten";
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
import { localHeapViewU16, setU16_local } from "../memory";

// Inclusive bounds of the two UTF-16 surrogate ranges, each stored as a
// one-code-unit string so it can be compared directly against a single
// string element such as str[i].
// High (leading) surrogates span U+D800..U+DBFF.
const SURROGATE_HIGHER_START = "\uD800";
const SURROGATE_HIGHER_END = "\uDBFF";
// Low (trailing) surrogates span U+DC00..U+DFFF.
const SURROGATE_LOWER_START = "\uDC00";
const SURROGATE_LOWER_END = "\uDFFF";

export function mono_wasm_change_case_invariant(src: number, srcLength: number, dst: number, dstLength: number, toUpper: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): void {
const exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
try {
const input = utf16ToStringLoop(src, src + 2 * srcLength);
let result = toUpper ? input.toUpperCase() : input.toLowerCase();
const result = toUpper ? input.toUpperCase() : input.toLowerCase();
// Unicode defines some codepoints which expand into multiple codepoints,
// originally we do not support this expansion
if (result.length > dstLength)
result = input;
stringToUTF16(dst, dst + 2 * dstLength, result);
wrap_no_error_root(is_exception, exceptionRoot);
if (result.length <= dstLength)
{
stringToUTF16(dst, dst + 2 * dstLength, result);
wrap_no_error_root(is_exception, exceptionRoot);
return;
}

// workaround to maintain the ICU-like behavior
const heapI16 = localHeapViewU16();
let jump = 1;
if (toUpper)
{
for (let i=0; i < input.length; i+=jump)
{
// surrogate parts have to enter ToUpper/ToLower together to give correct output
if (IsSurrogate(input, i))
{
jump = 2;
const surrogate = input.substring(i, i+2);
const upperSurrogate = surrogate.toUpperCase();
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);

}
else
{
jump = 1;
const upperChar = input[i].toUpperCase();
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
}
}
}
else
{
for (let i=0; i < input.length; i+=jump)
{
if (IsSurrogate(input, i))
{
jump = 2;
const surrogate = input.substring(i, i+2);
const upperSurrogate = surrogate.toLowerCase();
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);

}
else
{
jump = 1;
const upperChar = input[i].toLowerCase();
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
}
}
}
}
catch (ex: any) {
wrap_error_root(is_exception, ex, exceptionRoot);
Expand All @@ -35,11 +92,62 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
if (!cultureName)
throw new Error("Cannot change case, the culture name is null.");
const input = utf16ToStringLoop(src, src + 2 * srcLength);
let result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName);
if (result.length > dstLength)
result = input;
const result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName);

if (result.length <= input.length)
{
stringToUTF16(dst, dst + 2 * dstLength, result);
wrap_no_error_root(is_exception, exceptionRoot);
return;
}
// workaround to maintain the ICU-like behavior
const heapI16 = localHeapViewU16();
let jump = 1;
if (toUpper)
{
for (let i=0; i < input.length; i+=jump)
{
// surrogate parts have to enter ToUpper/ToLower together to give correct output
if (IsSurrogate(input, i))
{
jump = 2;
const surrogate = input.substring(i, i+2);
const upperSurrogate = surrogate.toLocaleUpperCase(cultureName);
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);

stringToUTF16(dst, dst + 2 * dstLength, result);
}
else
{
jump = 1;
const upperChar = input[i].toLocaleUpperCase(cultureName);
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
}
}
}
else
{
for (let i=0; i < input.length; i+=jump)
{
// surrogate parts have to enter ToUpper/ToLower together to give correct output
if (IsSurrogate(input, i))
{
jump = 2;
const surrogate = input.substring(i, i+2);
const upperSurrogate = surrogate.toLocaleLowerCase(cultureName);
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);
}
else
{
jump = 1;
const lowerChar = input[i].toLocaleLowerCase(cultureName);
const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar;
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
}
}
}
wrap_no_error_root(is_exception, exceptionRoot);
}
catch (ex: any) {
Expand All @@ -49,4 +157,19 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
cultureRoot.release();
exceptionRoot.release();
}
}
}

/**
 * Tells whether a complete surrogate pair begins at `startIdx` in `str`.
 *
 * The answer is true only when the code unit at `startIdx` lies between
 * SURROGATE_HIGHER_START and SURROGATE_HIGHER_END, a following code unit
 * exists, and that following unit lies between SURROGATE_LOWER_START and
 * SURROGATE_LOWER_END (all bounds inclusive).
 *
 * @param str String to inspect.
 * @param startIdx Index of the candidate leading code unit.
 * @returns true for a well-formed pair at `startIdx`, false otherwise
 *          (including when `startIdx` is outside the string).
 */
function IsSurrogate(str: string, startIdx: number) : boolean
{
    const lead = str[startIdx];
    // Written as a negated range test so that an undefined element
    // (index outside the string) is rejected rather than let through.
    const leadIsHigh = SURROGATE_HIGHER_START <= lead && lead <= SURROGATE_HIGHER_END;
    if (!leadIsHigh)
        return false;

    // A high surrogate in the last position has no partner.
    const nextIdx = startIdx + 1;
    if (!(nextIdx < str.length))
        return false;

    const trail = str[nextIdx];
    return SURROGATE_LOWER_START <= trail && trail <= SURROGATE_LOWER_END;
}

/**
 * Stores the first two UTF-16 code units of `surrogate` through
 * `setU16_local`, at offsets `dst + idx*2` and `dst + (idx+1)*2`.
 *
 * @param heapI16 Uint16Array view handed on to `setU16_local`.
 * @param dst Base offset of the destination buffer
 *            (presumably a byte address into the heap; confirm against `setU16_local`).
 * @param surrogate String whose code units 0 and 1 are written.
 * @param idx Index, in 16-bit units from `dst`, of the first code unit.
 */
function AppendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number)
{
    // One iteration per half of the pair; each code unit takes 2 bytes of offset.
    for (let half = 0; half < 2; half++)
    {
        const codeUnit = surrogate.charCodeAt(half);
        setU16_local(heapI16, dst + (idx + half) * 2, codeUnit);
    }
}