Skip to content

Commit eae73b7

Browse files
AlexRadchtarekgh
andauthored
Complex Rune enumeration over spans of UTF-16 and UTF-8 text (#111170)
* Added RunePosition * Revert solution file * Code review * RunePosition code review - Modified constructor to be `internal` and updated property assignments. - Enhanced `Equals` and `GetHashCode` methods for readability and performance. - Improved XML documentation for clarity and consistency with Unicode terminology. - Updated test methods to reflect changes in `RunePosition`, removing unnecessary assertions and adding new test cases. - Ensured enumerators correctly handle current position in Unicode data. * Added implementation of `IEnumerator<System.Text.RunePosition>` to `Utf16Enumerator` and `Utf8Enumerator` structures. * Move the APIs and tests to System.Runtime * Support Reset --------- Co-authored-by: Tarek Mahmoud Sayed <[email protected]> Co-authored-by: Tarek Mahmoud Sayed <[email protected]>
1 parent 640fa69 commit eae73b7

File tree

4 files changed

+618
-1
lines changed

4 files changed

+618
-1
lines changed

src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1212,6 +1212,7 @@
12121212
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Latin1Utility.Helpers.cs" />
12131213
<Compile Include="$(MSBuildThisFileDirectory)System\Text\NormalizationForm.cs" />
12141214
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Rune.cs" />
1215+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\RunePosition.cs" />
12151216
<Compile Include="$(MSBuildThisFileDirectory)System\Text\SpanLineEnumerator.cs" />
12161217
<Compile Include="$(MSBuildThisFileDirectory)System\Text\SpanRuneEnumerator.cs" />
12171218
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilder.cs" />
Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System.ComponentModel;
5+
using System.Buffers;
6+
using System.Collections.Generic;
7+
using System.Collections;
8+
9+
namespace System.Text;
10+
11+
/// <summary>
12+
/// Represents a position in Unicode data, allowing for deeper data inspection.
13+
/// </summary>
14+
/// <remarks>
15+
/// Invalid Unicode symbols will be represented by the <see cref="System.Text.Rune.ReplacementChar"/> value.
16+
/// </remarks>
17+
public readonly struct RunePosition : IEquatable<RunePosition>
18+
{
19+
/// <summary>
20+
/// Returns an enumeration of <see cref="RunePosition"/> from the provided span that allows deeper data inspection.
21+
/// </summary>
22+
/// <param name="span">The <see cref="ReadOnlySpan{T}"/> with Unicode data.</param>
23+
/// <returns>
24+
/// <see cref="Utf16Enumerator"/> to enumerate <see cref="RunePosition"/> from the provided span with UTF-16
25+
/// Unicode data.
26+
/// </returns>
27+
/// <remarks>
28+
/// Invalid Unicode symbols will be represented by <see cref="System.Text.Rune.ReplacementChar"/>
29+
/// value.
30+
/// </remarks>
31+
public static Utf16Enumerator EnumerateUtf16(ReadOnlySpan<char> span) => new(span);
32+
33+
/// <summary>
34+
/// Returns an enumeration of <see cref="RunePosition"/> from the provided span that allows deeper data inspection.
35+
/// </summary>
36+
/// <param name="span">The <see cref="ReadOnlySpan{T}"/> with Unicode data.</param>
37+
/// <returns>
38+
/// <see cref="Utf8Enumerator"/> to enumerate <see cref="RunePosition"/> from the provided span with UTF-8 Unicode
39+
/// data.
40+
/// </returns>
41+
/// <remarks>
42+
/// Invalid Unicode symbols will be represented by <see cref="Rune.ReplacementChar"/> value.
43+
/// </remarks>
44+
public static Utf8Enumerator EnumerateUtf8(ReadOnlySpan<byte> span) => new(span);
45+
46+
/// <summary>
47+
/// Unicode scalar value <see cref="System.Text.Rune"/> of the current symbol in Unicode data.
48+
/// Invalid Unicode symbols will be represented by <see cref="System.Text.Rune.ReplacementChar"/> value.
49+
/// </summary>
50+
public Rune Rune { get; }
51+
52+
/// <summary>
53+
/// The index of the current symbol in Unicode data.
54+
/// </summary>
55+
public int StartIndex { get; }
56+
57+
/// <summary>
58+
/// The length of the current symbol in Unicode data.
59+
/// </summary>
60+
public int Length { get; }
61+
62+
/// <summary>
63+
/// <see langword="false"/> if the current Unicode symbol is correctly encoded and <see cref="RunePosition.Rune"/>
64+
/// contains its scalar value.
65+
/// <br />
66+
/// <see langword="true"/> if the current Unicode symbol is invalidly encoded and <see cref="RunePosition.Rune"/> was
67+
/// replaced by <see cref="System.Text.Rune.ReplacementChar"/> value.
68+
/// </summary>
69+
public bool WasReplaced { get; }
70+
71+
/// <summary>
72+
/// Initializes a new instance of the <see cref="RunePosition"/> struct.
73+
/// </summary>
74+
/// <param name="rune">The Unicode scalar value.</param>
75+
/// <param name="startIndex">The index of the current symbol in Unicode data.</param>
76+
/// <param name="length">The length of the current symbol in Unicode data.</param>
77+
/// <param name="wasReplaced">Indicates if the current Unicode symbol was replaced.</param>
78+
public RunePosition(Rune rune, int startIndex, int length, bool wasReplaced)
79+
{
80+
if (startIndex < 0)
81+
{
82+
throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
83+
}
84+
85+
if ((uint)length > Rune.MaxUtf8BytesPerRune)
86+
{
87+
throw new ArgumentOutOfRangeException(nameof(length), SR.ArgumentOutOfRange_NeedNonNegNum);
88+
}
89+
90+
StartIndex = startIndex;
91+
Length = length;
92+
Rune = rune;
93+
WasReplaced = wasReplaced;
94+
}
95+
96+
/// <summary>
97+
/// Determines whether the specified <see cref="RunePosition"/> is equal to the current <see cref="RunePosition"/>.
98+
/// </summary>
99+
/// <param name="other">The other <see cref="RunePosition"/> to compare with.</param>
100+
/// <returns>
101+
/// <see langword="true"/> if the specified <see cref="RunePosition"/> is equal to the current
102+
/// <see cref="RunePosition"/>; otherwise, <see langword="false"/>.
103+
/// </returns>
104+
public bool Equals(RunePosition other) =>
105+
Rune == other.Rune && StartIndex == other.StartIndex && Length == other.Length && WasReplaced == other.WasReplaced;
106+
107+
/// <summary>
108+
/// Determines whether the specified <see cref="object"/> is equal to the current <see cref="RunePosition"/>.
109+
/// </summary>
110+
/// <param name="obj">The object to compare with the current <see cref="RunePosition"/>.</param>
111+
/// <returns>
112+
/// <see langword="true"/> if the specified <see cref="object"/> is equal to the current
113+
/// <see cref="RunePosition"/>; otherwise, <see langword="false"/>.
114+
/// </returns>
115+
public override bool Equals(object? obj) =>
116+
obj is RunePosition runePosition && Equals(runePosition);
117+
118+
/// <summary>
119+
/// Returns the hash code for the current <see cref="RunePosition"/>.
120+
/// </summary>
121+
/// <returns>The hash code for the current <see cref="RunePosition"/>.</returns>
122+
public override int GetHashCode() =>
123+
HashCode.Combine(Rune, StartIndex, Length, WasReplaced);
124+
125+
/// <summary>
126+
/// Deconstructs the <see cref="RunePosition"/> into its components.
127+
/// </summary>
128+
/// <param name="rune">The Unicode scalar value.</param>
129+
/// <param name="startIndex">The index of the current symbol in Unicode data.</param>
130+
[EditorBrowsable(EditorBrowsableState.Never)]
131+
public void Deconstruct(out Rune rune, out int startIndex)
132+
{
133+
rune = Rune;
134+
startIndex = StartIndex;
135+
}
136+
137+
/// <summary>
138+
/// Deconstructs the <see cref="RunePosition"/> into its components.
139+
/// </summary>
140+
/// <param name="rune">The Unicode scalar value.</param>
141+
/// <param name="startIndex">The index of the current symbol in Unicode data.</param>
142+
/// <param name="length">The length of the current symbol in Unicode data.</param>
143+
[EditorBrowsable(EditorBrowsableState.Never)]
144+
public void Deconstruct(out Rune rune, out int startIndex, out int length)
145+
{
146+
rune = Rune;
147+
startIndex = StartIndex;
148+
length = Length;
149+
}
150+
151+
/// <summary>
152+
/// Determines whether two specified <see cref="RunePosition"/> instances are equal.
153+
/// </summary>
154+
/// <param name="left">The first <see cref="RunePosition"/> to compare.</param>
155+
/// <param name="right">The second <see cref="RunePosition"/> to compare.</param>
156+
/// <returns>
157+
/// <see langword="true"/> if the two <see cref="RunePosition"/> instances are equal; otherwise,
158+
/// <see langword="false"/>.
159+
/// </returns>
160+
public static bool operator ==(RunePosition left, RunePosition right) => left.Equals(right);
161+
162+
/// <summary>
163+
/// Determines whether two specified <see cref="RunePosition"/> instances are not equal.
164+
/// </summary>
165+
/// <param name="left">The first <see cref="RunePosition"/> to compare.</param>
166+
/// <param name="right">The second <see cref="RunePosition"/> to compare.</param>
167+
/// <returns>
168+
/// <see langword="true"/> if the two <see cref="RunePosition"/> instances are not equal; otherwise,
169+
/// <see langword="false"/>.
170+
/// </returns>
171+
public static bool operator !=(RunePosition left, RunePosition right) => !(left == right);
172+
173+
/// <summary>
174+
/// An enumerator for retrieving <see cref="RunePosition"/> instances from Unicode data.
175+
/// </summary>
176+
/// <remarks>
177+
/// Methods are pattern-matched by the compiler to allow use in a <see langword="foreach"/> statement.
178+
/// </remarks>
179+
public ref struct Utf16Enumerator : IEnumerator<RunePosition>
180+
{
181+
private ReadOnlySpan<char> _original;
182+
private ReadOnlySpan<char> _remaining;
183+
184+
/// <summary>
185+
/// The current <see cref="RunePosition"/> in the Unicode data.
186+
/// </summary>
187+
public RunePosition Current { get; private set; }
188+
189+
/// <summary>
190+
/// Returns the current enumerator instance.
191+
/// </summary>
192+
/// <returns>The current enumerator instance.</returns>
193+
public Utf16Enumerator GetEnumerator() => this;
194+
195+
internal Utf16Enumerator(ReadOnlySpan<char> buffer)
196+
{
197+
_original = _remaining = buffer;
198+
Current = default;
199+
}
200+
201+
/// <summary>
202+
/// Moves to the next <see cref="RunePosition"/> in the Unicode data.
203+
/// </summary>
204+
/// <returns>
205+
/// <see langword="true"/> if the enumerator was successfully advanced to the next <see cref="RunePosition"/>;
206+
/// <br />
207+
/// <see langword="false"/> if the enumerator has passed the end of the span.</returns>
208+
public bool MoveNext()
209+
{
210+
if (_remaining.IsEmpty)
211+
{
212+
// reached the end of the buffer
213+
Current = default;
214+
return false;
215+
}
216+
217+
// In UTF-16 specifically, invalid sequences always have length 1, which is the same
218+
// length as the replacement character U+FFFD. This means that we can always bump the
219+
// next index by the current scalar's UTF-16 sequence length. This optimization is not
220+
// generally applicable; for example, enumerating scalars from UTF-8 cannot utilize
221+
// this same trick.
222+
223+
int scalarValue = Rune.ReadFirstRuneFromUtf16Buffer(_remaining);
224+
if (scalarValue >= 0)
225+
{
226+
Rune rune = Rune.UnsafeCreate((uint)scalarValue);
227+
int length = rune.Utf16SequenceLength;
228+
Current = new RunePosition(rune, Current.StartIndex + Current.Length, length, false);
229+
_remaining = _remaining.Slice(length);
230+
}
231+
else
232+
{
233+
Current = new RunePosition(Rune.ReplacementChar, Current.StartIndex + Current.Length, 1, true);
234+
_remaining = _remaining.Slice(1);
235+
}
236+
return true;
237+
}
238+
239+
public void Reset()
240+
{
241+
_remaining = _original;
242+
Current = default;
243+
}
244+
245+
object IEnumerator.Current => Current;
246+
247+
void IEnumerator.Reset() => Reset();
248+
249+
void IDisposable.Dispose() { }
250+
}
251+
252+
/// <summary>
253+
/// An enumerator for retrieving <see cref="RunePosition"/> instances from Unicode data.
254+
/// </summary>
255+
/// <remarks>
256+
/// Methods are pattern-matched by the compiler to allow use in a <see langword="foreach"/> statement.
257+
/// </remarks>
258+
public ref struct Utf8Enumerator : IEnumerator<RunePosition>
259+
{
260+
private ReadOnlySpan<byte> _original;
261+
private ReadOnlySpan<byte> _remaining;
262+
263+
/// <summary>
264+
/// The current <see cref="RunePosition"/> in the Unicode data.
265+
/// </summary>
266+
public RunePosition Current { get; private set; }
267+
268+
/// <summary>
269+
/// Returns the current enumerator instance.
270+
/// </summary>
271+
/// <returns>The current enumerator instance.</returns>
272+
public Utf8Enumerator GetEnumerator() => this;
273+
274+
/// <summary>
275+
/// Initializes a new instance of the <see cref="Utf8Enumerator"/> struct.
276+
/// </summary>
277+
/// <param name="buffer">The buffer containing the Unicode data.</param>
278+
internal Utf8Enumerator(ReadOnlySpan<byte> buffer)
279+
{
280+
_original = _remaining = buffer;
281+
Current = default;
282+
}
283+
284+
/// <summary>
285+
/// Moves to the next <see cref="RunePosition"/> in the Unicode data.
286+
/// </summary>
287+
/// <returns>
288+
/// <see langword="true"/> if the enumerator was successfully advanced to the next <see cref="RunePosition"/>;
289+
/// <br />
290+
/// <see langword="false"/> if the enumerator has passed the end of the span.
291+
/// </returns>
292+
public bool MoveNext()
293+
{
294+
if (_remaining.IsEmpty)
295+
{
296+
// reached the end of the buffer
297+
Current = default;
298+
return false;
299+
}
300+
301+
bool wasReplaced = Rune.DecodeFromUtf8(_remaining, out Rune rune, out int charsConsumed) != OperationStatus.Done;
302+
Current = new RunePosition(rune, Current.StartIndex + Current.Length, charsConsumed, wasReplaced);
303+
_remaining = _remaining.Slice(charsConsumed);
304+
return true;
305+
}
306+
307+
public void Reset()
308+
{
309+
_remaining = _original;
310+
Current = default;
311+
}
312+
313+
object IEnumerator.Current => Current;
314+
315+
void IEnumerator.Reset() => Reset();
316+
317+
void IDisposable.Dispose() { }
318+
}
319+
}

src/libraries/System.Runtime/ref/System.Runtime.cs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15720,6 +15720,48 @@ public enum NormalizationForm
1572015720
public bool TryEncodeToUtf8(System.Span<byte> destination, out int bytesWritten) { throw null; }
1572115721
public static bool TryGetRuneAt(string input, int index, out System.Text.Rune value) { throw null; }
1572215722
}
15723+
public readonly partial struct RunePosition : System.IEquatable<System.Text.RunePosition>
15724+
{
15725+
private readonly int _dummyPrimitive;
15726+
public static System.Text.RunePosition.Utf16Enumerator EnumerateUtf16(System.ReadOnlySpan<char> span) { throw null; }
15727+
public static System.Text.RunePosition.Utf8Enumerator EnumerateUtf8(System.ReadOnlySpan<byte> span) { throw null; }
15728+
public System.Text.Rune Rune { get { throw null; } }
15729+
public int StartIndex { get { throw null; } }
15730+
public int Length { get { throw null; } }
15731+
public bool WasReplaced { get { throw null; } }
15732+
public RunePosition(Rune rune, int startIndex, int length, bool wasReplaced) { throw null; }
15733+
public bool Equals(System.Text.RunePosition other) { throw null; }
15734+
public override bool Equals(object? obj) { throw null; }
15735+
public override int GetHashCode() { throw null; }
15736+
[System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
15737+
public void Deconstruct(out System.Text.Rune rune, out int startIndex) { throw null; }
15738+
[System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
15739+
public void Deconstruct(out System.Text.Rune rune, out int startIndex, out int length) { throw null; }
15740+
public static bool operator ==(System.Text.RunePosition left, System.Text.RunePosition right) { throw null; }
15741+
public static bool operator !=(System.Text.RunePosition left, System.Text.RunePosition right) { throw null; }
15742+
public ref partial struct Utf16Enumerator : System.Collections.Generic.IEnumerator<System.Text.RunePosition>, System.Collections.IEnumerator, System.IDisposable
15743+
{
15744+
private readonly int _dummyPrimitive;
15745+
public System.Text.RunePosition Current { get { throw null; } }
15746+
public System.Text.RunePosition.Utf16Enumerator GetEnumerator() { throw null; }
15747+
public bool MoveNext() { throw null; }
15748+
public void Reset() { throw null; }
15749+
object System.Collections.IEnumerator.Current { get { throw null; } }
15750+
void System.Collections.IEnumerator.Reset() { }
15751+
void System.IDisposable.Dispose() { }
15752+
}
15753+
public ref partial struct Utf8Enumerator: System.Collections.Generic.IEnumerator<System.Text.RunePosition>, System.Collections.IEnumerator, System.IDisposable
15754+
{
15755+
private readonly int _dummyPrimitive;
15756+
public System.Text.RunePosition Current { get { throw null; } }
15757+
public System.Text.RunePosition.Utf8Enumerator GetEnumerator() { throw null; }
15758+
public bool MoveNext() { throw null; }
15759+
public void Reset() { throw null; }
15760+
object System.Collections.IEnumerator.Current { get { throw null; } }
15761+
void System.Collections.IEnumerator.Reset() { }
15762+
void System.IDisposable.Dispose() { }
15763+
}
15764+
}
1572315765
public sealed partial class StringBuilder : System.Runtime.Serialization.ISerializable
1572415766
{
1572515767
public StringBuilder() { }

0 commit comments

Comments
 (0)