Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 77a09eb

Browse files
Add optimized UTF-8 validation and transcoding apis, hook them up to UTF8Encoding (#21948)
* Add optimized UTF-8 validation and transcoding logic
  - Hook it up through the existing Utf8 public static APIs
  - Move some shared methods out of ASCIIUtility
  - Hook it up through the Utf8String ctor
* Hook up new UTF-8 logic through UTF8Encoding
  - Add vectorized UTF-16 validation and transcoded byte counts
  - Move Utf16Utility into Unicode namespace alongside Utf8Utility
  - Fix some bugs in DecoderNLS's draining logic
* Improve perf of "is ASCII?" inner loop in UTF-8 validation.
* Remove SSE41.X64 optimization from AsciiUtility
  RyuJIT now handles this optimally
* Clarify that vector read is unaligned
* Simplify vectorized logic; remove unnecessary adjustment
* PR feedback: GetElement(0) -> Sse2.StoreLow
* PR feedback
  - Simplify CountNumberOfLeadingAsciiBytesFrom24BitInteger
  - Extract some consts out to top of file w/ comments
* PR feedback: Enable SSE2 in Utf16Utility code
* Expand masks in Utf8Utility, fix const in fallback path
* Temporarily disable failing CoreFX tests
* Fix incorrect Debug.Assert statements
* Add comments tracking JIT workarounds.
* Rename DWORD -> UInt32 throughout API surface
* Re-flow Utf8Utility.Helpers
* PR feedback: Fix typos
* PR feedback: CountNumberOfLeadingAsciiBytesFrom24BitInteger
* PR feedback: Remove redundant endianness checks
* PR feedback: Validate nint definitions
* PR feedback: Clarify charIsNonAscii vector usage
* PR feedback: document tempUtf8CodeUnitCountAdjustment usage
* Fix compilation failure in Utf16Utility
* PR feedback: Clarify 3-byte sequence processing
* Add missing check to 3-byte processing logic
* Clarify comment in 3-byte processing
2 parents daa688d + e307d14 commit 77a09eb

19 files changed

+4227
-2430
lines changed

src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,7 @@
768768
<Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" />
769769
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIEncoding.cs" />
770770
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.cs" />
771+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.Helpers.cs" />
771772
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilderCache.cs" />
772773
<Compile Include="$(MSBuildThisFileDirectory)System\Text\CodePageDataItem.cs" />
773774
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Decoder.cs" />
@@ -799,13 +800,17 @@
799800
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeDebug.cs" />
800801
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeEncoding.cs" />
801802
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UnicodeUtility.cs" />
802-
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Utf16Utility.cs" />
803803
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF32Encoding.cs" />
804804
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF7Encoding.cs" />
805805
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
806806
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
807+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16Utility.cs" />
808+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16Utility.Validation.cs" />
807809
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
808810
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.cs" />
811+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Helpers.cs" />
812+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Transcoding.cs" />
813+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.Validation.cs" />
809814
<Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
810815
<Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
811816
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />

src/System.Private.CoreLib/shared/System/Globalization/CompareInfo.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
using System.Runtime.CompilerServices;
99
using System.Runtime.InteropServices;
1010
using System.Runtime.Serialization;
11-
using System.Text;
11+
using System.Text.Unicode;
1212
using Internal.Runtime.CompilerServices;
1313

1414
namespace System.Globalization

src/System.Private.CoreLib/shared/System/Globalization/TextInfo.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
using System.Runtime.InteropServices;
99
using System.Runtime.Serialization;
1010
using System.Text;
11+
using System.Text.Unicode;
1112
using Internal.Runtime.CompilerServices;
1213

1314
#if BIT64

src/System.Private.CoreLib/shared/System/Marvin.OrdinalIgnoreCase.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
using System.Buffers;
66
using System.Diagnostics;
77
using System.Runtime.InteropServices;
8-
using System.Text;
8+
using System.Text.Unicode;
99
using Internal.Runtime.CompilerServices;
1010

1111
#if BIT64

src/System.Private.CoreLib/shared/System/Text/ASCIIEncoding.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ private unsafe int GetByteCountCommon(char* pChars, int charCount)
159159
// Common helper method for all non-EncoderNLS entry points to GetByteCount.
160160
// A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
161161

162-
Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer.");
162+
Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
163163
Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
164164

165165
// First call into the fast path.
@@ -340,9 +340,9 @@ private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int
340340
// Common helper method for all non-EncoderNLS entry points to GetBytes.
341341
// A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
342342

343-
Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer.");
343+
Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
344344
Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
345-
Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
345+
Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
346346
Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
347347

348348
// First call into the fast path.
@@ -496,7 +496,7 @@ private unsafe int GetCharCountCommon(byte* pBytes, int byteCount)
496496
// Common helper method for all non-DecoderNLS entry points to GetCharCount.
497497
// A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
498498

499-
Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
499+
Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
500500
Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
501501

502502
// First call into the fast path.
@@ -624,9 +624,9 @@ private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int
624624
// Common helper method for all non-DecoderNLS entry points to GetChars.
625625
// A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
626626

627-
Debug.Assert(byteCount >= 0, "Caller should't specify negative length buffer.");
627+
Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
628628
Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
629-
Debug.Assert(charCount >= 0, "Caller should't specify negative length buffer.");
629+
Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
630630
Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
631631

632632
// First call into the fast path.
src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.Helpers.cs

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Diagnostics;
6+
using System.Numerics;
7+
using System.Runtime.CompilerServices;
8+
using System.Runtime.Intrinsics.X86;
9+
10+
namespace System.Text
11+
{
12+
internal static partial class ASCIIUtility
13+
{
14+
/// <summary>
15+
/// A mask which selects only the high bit of each byte of the given <see cref="uint"/>.
16+
/// </summary>
17+
private const uint UInt32HighBitsOnlyMask = 0x80808080u;
18+
19+
/// <summary>
20+
/// A mask which selects only the high bit of each byte of the given <see cref="ulong"/>.
21+
/// </summary>
22+
private const ulong UInt64HighBitsOnlyMask = 0x80808080_80808080ul;
23+
24+
/// <summary>
25+
/// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
26+
/// </summary>
27+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
28+
internal static bool AllBytesInUInt32AreAscii(uint value)
29+
{
30+
// If the high bit of any byte is set, that byte is non-ASCII.
31+
32+
return (value & UInt32HighBitsOnlyMask) == 0;
33+
}
34+
35+
/// <summary>
36+
/// Given a DWORD which represents a four-byte buffer read in machine endianness, and which
37+
/// the caller has asserted contains a non-ASCII byte *somewhere* in the data, counts the
38+
/// number of consecutive ASCII bytes starting from the beginning of the buffer. Returns
39+
/// a value 0 - 3, inclusive. (The caller is responsible for ensuring that the buffer doesn't
40+
/// contain all-ASCII data.)
41+
/// </summary>
42+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
43+
internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(uint value)
44+
{
45+
Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value.");
46+
47+
// Use BMI1 directly rather than going through BitOperations. We only see a perf gain here
48+
// if we're able to emit a real tzcnt instruction; the software fallback used by BitOperations
49+
// is too slow for our purposes since we can provide our own faster, specialized software fallback.
50+
51+
if (Bmi1.IsSupported)
52+
{
53+
Debug.Assert(BitConverter.IsLittleEndian);
54+
return Bmi1.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
55+
}
56+
57+
// Couldn't emit tzcnt, use specialized software fallback.
58+
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
59+
// on whether all processed bytes were ASCII. Then we accumulate all of the
60+
// results to calculate how many consecutive ASCII bytes are present.
61+
62+
value = ~value;
63+
64+
if (BitConverter.IsLittleEndian)
65+
{
66+
// Read first byte
67+
value >>= 7;
68+
uint allBytesUpToNowAreAscii = value & 1;
69+
uint numAsciiBytes = allBytesUpToNowAreAscii;
70+
71+
// Read second byte
72+
value >>= 8;
73+
allBytesUpToNowAreAscii &= value;
74+
numAsciiBytes += allBytesUpToNowAreAscii;
75+
76+
// Read third byte
77+
value >>= 8;
78+
allBytesUpToNowAreAscii &= value;
79+
numAsciiBytes += allBytesUpToNowAreAscii;
80+
81+
return numAsciiBytes;
82+
}
83+
else
84+
{
85+
// BinaryPrimitives.ReverseEndianness is only implemented as an intrinsic on
86+
// little-endian platforms, so using it in this big-endian path would be too
87+
// expensive. Instead we'll just change how we perform the shifts.
88+
89+
// Read first byte
90+
value = BitOperations.RotateLeft(value, 1);
91+
uint allBytesUpToNowAreAscii = value & 1;
92+
uint numAsciiBytes = allBytesUpToNowAreAscii;
93+
94+
// Read second byte
95+
value = BitOperations.RotateLeft(value, 8);
96+
allBytesUpToNowAreAscii &= value;
97+
numAsciiBytes += allBytesUpToNowAreAscii;
98+
99+
// Read third byte
100+
value = BitOperations.RotateLeft(value, 8);
101+
allBytesUpToNowAreAscii &= value;
102+
numAsciiBytes += allBytesUpToNowAreAscii;
103+
104+
return numAsciiBytes;
105+
}
106+
}
107+
}
108+
}

src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs

Lines changed: 14 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,20 @@ namespace System.Text
2121
{
2222
internal static partial class ASCIIUtility
2323
{
24-
/// <summary>
25-
/// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
26-
/// </summary>
27-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
28-
private static bool AllBytesInUInt32AreAscii(uint value)
24+
#if DEBUG
25+
static ASCIIUtility()
2926
{
30-
return ((value & 0x80808080u) == 0);
27+
Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
28+
Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
3129
}
30+
#endif // DEBUG
3231

3332
[MethodImpl(MethodImplOptions.AggressiveInlining)]
3433
private static bool AllBytesInUInt64AreAscii(ulong value)
3534
{
36-
return ((value & 0x80808080_80808080ul) == 0);
35+
// If the high bit of any byte is set, that byte is non-ASCII.
36+
37+
return ((value & UInt64HighBitsOnlyMask) == 0);
3738
}
3839

3940
/// <summary>
@@ -54,56 +55,6 @@ private static bool AllCharsInUInt64AreAscii(ulong value)
5455
return ((value & ~0x007F007F_007F007Ful) == 0);
5556
}
5657

57-
/// <summary>
58-
/// Given a 24-bit integer which represents a three-byte buffer read in machine endianness,
59-
/// counts the number of consecutive ASCII bytes starting from the beginning of the buffer.
60-
/// Returns a value 0 - 3, inclusive.
61-
/// </summary>
62-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
63-
private static uint CountNumberOfLeadingAsciiBytesFrom24BitInteger(uint value)
64-
{
65-
// This implementation seems to have better performance than tzcnt.
66-
67-
// The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
68-
// on whether all processed bytes were ASCII. Then we accumulate all of the
69-
// results to calculate how many consecutive ASCII bytes are present.
70-
71-
value = ~value;
72-
73-
if (BitConverter.IsLittleEndian)
74-
{
75-
// Read first byte
76-
uint allBytesUpToNowAreAscii = (value >>= 7) & 1;
77-
uint numAsciiBytes = allBytesUpToNowAreAscii;
78-
79-
// Read second byte
80-
allBytesUpToNowAreAscii &= (value >>= 8);
81-
numAsciiBytes += allBytesUpToNowAreAscii;
82-
83-
// Read third byte
84-
allBytesUpToNowAreAscii &= (value >>= 8);
85-
numAsciiBytes += allBytesUpToNowAreAscii;
86-
87-
return numAsciiBytes;
88-
}
89-
else
90-
{
91-
// Read first byte
92-
uint allBytesUpToNowAreAscii = (value = ROL32(value, 1)) & 1;
93-
uint numAsciiBytes = allBytesUpToNowAreAscii;
94-
95-
// Read second byte
96-
allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
97-
numAsciiBytes += allBytesUpToNowAreAscii;
98-
99-
// Read third byte
100-
allBytesUpToNowAreAscii &= (value = ROL32(value, 8));
101-
numAsciiBytes += allBytesUpToNowAreAscii;
102-
103-
return numAsciiBytes;
104-
}
105-
}
106-
10758
/// <summary>
10859
/// Given a DWORD which represents two packed chars in machine-endian order,
10960
/// returns <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
@@ -273,7 +224,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, n
273224
// we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be
274225
// non-ASCII. In both cases we only care about the low 24 bits.
275226

276-
pBuffer += CountNumberOfLeadingAsciiBytesFrom24BitInteger(currentUInt32);
227+
pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32);
277228
goto Finish;
278229
}
279230

@@ -435,7 +386,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
435386

436387
uint currentDWord;
437388
Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
438-
pBuffer += CountNumberOfLeadingAsciiBytesFrom24BitInteger(currentDWord);
389+
pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
439390

440391
goto Finish;
441392

@@ -461,7 +412,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
461412
// Clear everything but the high bit of each byte, then tzcnt.
462413
// Remember the / 8 at the end to convert bit count to byte count.
463414

464-
candidateUInt64 &= 0x80808080_80808080ul;
415+
candidateUInt64 &= UInt64HighBitsOnlyMask;
465416
pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
466417
goto Finish;
467418
}
@@ -1395,17 +1346,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pA
13951346
// Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
13961347

13971348
Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
1398-
1399-
if (Sse41.X64.IsSupported)
1400-
{
1401-
// Use PEXTRQ instruction if available, since it can extract from the vector directly to the destination address.
1402-
Unsafe.WriteUnaligned<ulong>(pAsciiBuffer, Sse41.X64.Extract(asciiVector.AsUInt64(), 0));
1403-
}
1404-
else
1405-
{
1406-
// Bounce this through a temporary register (with potential stack spillage) before writing to memory.
1407-
Unsafe.WriteUnaligned<ulong>(pAsciiBuffer, asciiVector.AsUInt64().GetElement(0));
1408-
}
1349+
Sse2.StoreLow((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
14091350

14101351
nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
14111352

@@ -1444,16 +1385,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pA
14441385

14451386
// Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
14461387
asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
1447-
1448-
// See comments earlier in this method for information about how this works.
1449-
if (Sse41.X64.IsSupported)
1450-
{
1451-
Unsafe.WriteUnaligned<ulong>(pAsciiBuffer + currentOffsetInElements, Sse41.X64.Extract(asciiVector.AsUInt64(), 0));
1452-
}
1453-
else
1454-
{
1455-
Unsafe.WriteUnaligned<ulong>(pAsciiBuffer + currentOffsetInElements, asciiVector.AsUInt64().GetElement(0));
1456-
}
1388+
Sse2.StoreLow((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
14571389
}
14581390

14591391
// Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
@@ -1529,26 +1461,12 @@ private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pA
15291461

15301462
Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
15311463

1532-
// See comments earlier in this method for information about how this works.
1533-
if (Sse41.X64.IsSupported)
1534-
{
1535-
*(ulong*)(pAsciiBuffer + currentOffsetInElements) = Sse41.X64.Extract(asciiVector.AsUInt64(), 0);
1536-
}
1537-
else
1538-
{
1539-
*(ulong*)(pAsciiBuffer + currentOffsetInElements) = asciiVector.AsUInt64().GetElement(0);
1540-
}
1464+
Sse2.StoreLow((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
15411465
currentOffsetInElements += SizeOfVector128 / 2;
15421466

15431467
goto Finish;
15441468
}
15451469

1546-
/// <summary>
1547-
/// Rotates a <see cref="uint"/> left. The JIT is smart enough to turn this into a ROL / ROR instruction.
1548-
/// </summary>
1549-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1550-
private static uint ROL32(uint value, int shift) => (value << shift) | (value >> (32 - shift));
1551-
15521470
/// <summary>
15531471
/// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
15541472
/// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered

0 commit comments

Comments
 (0)