Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 43a5159

Browse files
Refactor Encoding to split fast-path and fallback logic (#23098)
This refactoring is limited to ASCIIEncoding at the moment, but it can easily be applied to UTF-8 / UTF-16 / UTF-32.

High-level changes:
- Fallback logic has been split from the fast path, improving the performance of GetBytes and similar routines.
- All of the plumbing that decides when to invoke the fallback logic and how to manage leftover data has been moved into the base class.
- Almost all of the logic except for the fast path is now written in terms of verifiable code (Span and ReadOnlySpan).
- Minor bug fixes in EncoderNLS.Convert (see https://github.com/dotnet/coreclr/issues/23020).
1 parent 60773aa commit 43a5159

File tree

13 files changed

+2584
-658
lines changed

13 files changed

+2584
-658
lines changed

src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,7 @@
761761
<Compile Include="$(MSBuildThisFileDirectory)System\StringSplitOptions.cs" />
762762
<Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" />
763763
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIEncoding.cs" />
764+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.cs" />
764765
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilderCache.cs" />
765766
<Compile Include="$(MSBuildThisFileDirectory)System\Text\CodePageDataItem.cs" />
766767
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Decoder.cs" />
@@ -776,6 +777,7 @@
776777
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncoderFallback.cs" />
777778
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncoderReplacementFallback.cs" />
778779
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Encoding.cs" />
780+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Encoding.Internal.cs" />
779781
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingData.cs" />
780782
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingInfo.cs" />
781783
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingNLS.cs" />

src/System.Private.CoreLib/shared/System/String.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ internal static unsafe string CreateStringFromEncoding(
480480
Debug.Assert(byteLength >= 0);
481481

482482
// Get our string length
483-
int stringLength = encoding.GetCharCount(bytes, byteLength, null);
483+
int stringLength = encoding.GetCharCount(bytes, byteLength);
484484
Debug.Assert(stringLength >= 0, "stringLength >= 0");
485485

486486
// They gave us an empty string if they needed one
@@ -491,7 +491,7 @@ internal static unsafe string CreateStringFromEncoding(
491491
string s = FastAllocateString(stringLength);
492492
fixed (char* pTempChars = &s._firstChar)
493493
{
494-
int doubleCheck = encoding.GetChars(bytes, byteLength, pTempChars, stringLength, null);
494+
int doubleCheck = encoding.GetChars(bytes, byteLength, pTempChars, stringLength);
495495
Debug.Assert(stringLength == doubleCheck,
496496
"Expected encoding.GetChars to return same length as encoding.GetCharCount");
497497
}

src/System.Private.CoreLib/shared/System/Text/ASCIIEncoding.cs

Lines changed: 520 additions & 608 deletions
Large diffs are not rendered by default.
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Runtime.CompilerServices;
6+
7+
namespace System.Text
8+
{
9+
/*
10+
* Contains naive unoptimized (non-SIMD) implementations of ASCII transcoding
11+
* operations. Vectorized methods can be substituted here as a drop-in replacement.
12+
*/
13+
14+
// Scalar (one element at a time) helpers for scanning and transcoding ASCII data
// through raw pointers. Every method walks its input from index 0 and returns the
// index at which it stopped, which equals the supplied count when no non-ASCII
// element was encountered. None of the methods validate their pointer arguments.
internal unsafe static class ASCIIUtility
15+
{
16+
// Returns the index of the first byte in pBytes[0 .. byteCount) whose value is
// 0x80 or greater, or byteCount if every byte examined is ASCII.
[MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
17+
public static uint GetIndexOfFirstNonAsciiByte(byte* pBytes, uint byteCount)
18+
{
19+
uint idx = 0;
20+
for (; idx < byteCount; idx++)
21+
{
22+
// Reinterpreting the byte as signed makes every value >= 0x80 negative,
// so this is a high-bit test.
if ((sbyte)pBytes[idx] < 0)
23+
{
24+
break;
25+
}
26+
}
27+
return idx;
28+
}
29+
30+
// Returns the index of the first char in pChars[0 .. charCount) whose value is
// greater than 0x7F, or charCount if every char examined is ASCII.
[MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
31+
public static uint GetIndexOfFirstNonAsciiChar(char* pChars, uint charCount)
32+
{
33+
uint idx = 0;
34+
for (; idx < charCount; idx++)
35+
{
36+
if (pChars[idx] > 0x7Fu)
37+
{
38+
break;
39+
}
40+
}
41+
return idx;
42+
}
43+
44+
// Copies chars from pChars to pBytes, one byte per char, stopping at the first
// char greater than 0x7F or after elementCount elements. Returns the number of
// elements copied; the offending char, if any, is not written to pBytes.
[MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
45+
public static uint NarrowUtf16ToAscii(char* pChars, byte* pBytes, uint elementCount)
46+
{
47+
uint idx = 0;
48+
for (; idx < elementCount; idx++)
49+
{
50+
uint ch = pChars[idx];
51+
if (ch > 0x7Fu)
52+
{
53+
break;
54+
}
55+
// ch is known to be <= 0x7F here, so the narrowing cast loses no data.
pBytes[idx] = (byte)ch;
56+
}
57+
return idx;
58+
}
59+
60+
// Copies bytes from pBytes to pChars, one char per byte, stopping at the first
// byte greater than 0x7F or after elementCount elements. Returns the number of
// elements copied; the offending byte, if any, is not written to pChars.
[MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
61+
public static uint WidenAsciiToUtf16(byte* pBytes, char* pChars, uint elementCount)
62+
{
63+
uint idx = 0;
64+
for (; idx < elementCount; idx++)
65+
{
66+
byte b = pBytes[idx];
67+
if (b > 0x7F)
68+
{
69+
break;
70+
}
71+
pChars[idx] = (char)b;
72+
}
73+
return idx;
74+
}
75+
}
76+
}

src/System.Private.CoreLib/shared/System/Text/DecoderFallback.cs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ public virtual void Reset()
6767
internal unsafe byte* byteStart;
6868
internal unsafe char* charEnd;
6969

70+
internal Encoding _encoding;
71+
internal DecoderNLS _decoder;
72+
private int _originalByteCount;
73+
7074
// Internal Reset
7175
internal unsafe void InternalReset()
7276
{
@@ -82,6 +86,22 @@ internal unsafe void InternalInitialize(byte* byteStart, char* charEnd)
8286
this.charEnd = charEnd;
8387
}
8488

89+
// Obtains a fallback buffer and stamps it with the context for the current operation.
// When 'decoder' is null the buffer comes from encoding.DecoderFallback.CreateFallbackBuffer();
// otherwise the value of decoder.FallbackBuffer is used. The buffer's _encoding, _decoder,
// and _originalByteCount fields are then overwritten with the supplied arguments
// (so _decoder is null when no decoder was supplied) and the buffer is returned.
internal static DecoderFallbackBuffer CreateAndInitialize(Encoding encoding, DecoderNLS decoder, int originalByteCount)
90+
{
91+
// The original byte count is only used for keeping track of what 'index' value needs
92+
// to be passed to the abstract Fallback method. The index value is calculated by subtracting
93+
// 'bytes.Length' (where bytes is expected to be the entire remaining input buffer)
94+
// from the 'originalByteCount' value specified here.
95+
96+
DecoderFallbackBuffer fallbackBuffer = (decoder is null) ? encoding.DecoderFallback.CreateFallbackBuffer() : decoder.FallbackBuffer;
97+
98+
fallbackBuffer._encoding = encoding;
99+
fallbackBuffer._decoder = decoder;
100+
fallbackBuffer._originalByteCount = originalByteCount;
101+
102+
return fallbackBuffer;
103+
}
104+
85105
// Fallback the current byte by sticking it into the remaining char buffer.
86106
// This can only be called by our encodings (others have to use the public fallback methods), so
87107
// we can use our DecoderNLS here too (except we don't).
@@ -191,6 +211,90 @@ internal unsafe virtual int InternalFallback(byte[] bytes, byte* pBytes)
191211
return 0;
192212
}
193213

214+
// Invokes Fallback on the first 'fallbackLength' bytes of 'remainingBytes' (copied into a
// new array) and returns the number of UTF-16 chars the fallback produces, or 0 when
// Fallback returns false. The 'index' passed to Fallback is
// _originalByteCount - remainingBytes.Length, so 'remainingBytes' must be the entire
// unconsumed tail of the input for that index to be meaningful.
internal int InternalFallbackGetCharCount(ReadOnlySpan<byte> remainingBytes, int fallbackLength)
215+
{
216+
return (Fallback(remainingBytes.Slice(0, fallbackLength).ToArray(), index: _originalByteCount - remainingBytes.Length))
217+
? DrainRemainingDataForGetCharCount()
218+
: 0;
219+
}
220+
221+
// Invokes Fallback on the first 'fallbackLength' bytes of 'remainingBytes' (copied into a
// new array) and, when Fallback returns true, writes the fallback's output into 'chars'
// via TryDrainRemainingDataForGetChars, returning that method's result and 'charsWritten'.
// When Fallback returns false, nothing is written: 'charsWritten' is 0 and the result is true.
// The 'index' passed to Fallback is _originalByteCount - remainingBytes.Length.
internal bool TryInternalFallbackGetChars(ReadOnlySpan<byte> remainingBytes, int fallbackLength, Span<char> chars, out int charsWritten)
222+
{
223+
if (Fallback(remainingBytes.Slice(0, fallbackLength).ToArray(), index: _originalByteCount - remainingBytes.Length))
224+
{
225+
return TryDrainRemainingDataForGetChars(chars, out charsWritten);
226+
}
227+
else
228+
{
229+
// Return true because we weren't asked to write anything, so this is a "success" in the sense that
230+
// the output buffer was large enough to hold the desired 0 chars of output.
231+
232+
charsWritten = 0;
233+
return true;
234+
}
235+
}
236+
237+
// Reads the next Rune from this buffer by calling GetNextChar() once or twice.
// Throws ArgumentException (SR.Argument_InvalidCharSequenceNoIndex) when neither the
// first char alone nor the first two chars together form a valid Rune.
// The draining loops in this class stop when the returned Rune's Value is 0.
private Rune GetNextRune()
238+
{
239+
// Call GetNextChar() and try treating it as a non-surrogate character.
240+
// If that fails, call GetNextChar() again and attempt to treat the two chars
241+
// as a surrogate pair. If that still fails, throw an exception since the fallback
242+
// mechanism is giving us a bad replacement character.
243+
244+
Rune rune;
245+
char ch = GetNextChar();
246+
// '&&' short-circuits: the second GetNextChar() call happens only when the
// single-char TryCreate has already failed.
if (!Rune.TryCreate(ch, out rune) && !Rune.TryCreate(ch, GetNextChar(), out rune))
247+
{
248+
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
249+
}
250+
251+
return rune;
252+
}
253+
254+
// Consumes Runes from GetNextRune() until one with Value 0 is returned, and returns the
// sum of their Utf16SequenceLength values. If the running total becomes negative, the
// buffer is reset via InternalReset() and Encoding.ThrowConversionOverflow() is called.
internal int DrainRemainingDataForGetCharCount()
255+
{
256+
int totalCharCount = 0;
257+
258+
Rune thisRune;
259+
while ((thisRune = GetNextRune()).Value != 0)
260+
{
261+
// We need to check for overflow while tallying the fallback char count.
262+
263+
totalCharCount += thisRune.Utf16SequenceLength;
264+
// NOTE(review): a negative total is treated as int wrap-around; this presumes the
// addition above runs in an unchecked context - confirm against the build settings.
if (totalCharCount < 0)
265+
{
266+
InternalReset();
267+
Encoding.ThrowConversionOverflow();
268+
}
269+
}
270+
271+
return totalCharCount;
272+
}
273+
274+
// Consumes Runes from GetNextRune() until one with Value 0 is returned, encoding each
// into 'chars' in order. Returns true with 'charsWritten' set to the total number of
// chars written when every Rune fit. If a Rune cannot be encoded into the remaining
// space, the buffer is reset via InternalReset(), 'charsWritten' is set to 0, and the
// method returns false; chars written before that point are not reported.
internal bool TryDrainRemainingDataForGetChars(Span<char> chars, out int charsWritten)
275+
{
276+
int originalCharCount = chars.Length;
277+
278+
Rune thisRune;
279+
while ((thisRune = GetNextRune()).Value != 0)
280+
{
281+
if (thisRune.TryEncode(chars, out int charsWrittenJustNow))
282+
{
283+
// Advance the local view of the destination past what was just written.
chars = chars.Slice(charsWrittenJustNow);
284+
continue;
285+
}
286+
else
287+
{
288+
InternalReset();
289+
charsWritten = default;
290+
return false;
291+
}
292+
}
293+
294+
// 'chars' has been re-sliced after every write, so the length difference is the total written.
charsWritten = originalCharCount - chars.Length;
295+
return true;
296+
}
297+
194298
// private helper methods
195299
internal void ThrowLastBytesRecursive(byte[] bytesUnknown)
196300
{

0 commit comments

Comments
 (0)