diff --git a/src/Compilers/CSharp/Portable/Parser/AbstractLexer.cs b/src/Compilers/CSharp/Portable/Parser/AbstractLexer.cs index 6eb8346d6fdd3..43006797668e2 100644 --- a/src/Compilers/CSharp/Portable/Parser/AbstractLexer.cs +++ b/src/Compilers/CSharp/Portable/Parser/AbstractLexer.cs @@ -11,24 +11,27 @@ namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax // separate out text windowing implementation (keeps scanning & lexing functions from abusing details) internal class AbstractLexer : IDisposable { - internal readonly SlidingTextWindow TextWindow; + /// + /// Not readonly. This is a mutable struct that will be modified as we lex tokens. + /// + internal SlidingTextWindow TextWindow; + private List? _errors; + protected int LexemeStartPosition; protected AbstractLexer(SourceText text) { this.TextWindow = new SlidingTextWindow(text); } - protected int LexemeStartPosition => this.TextWindow.LexemeStartPosition; - public virtual void Dispose() { - this.TextWindow.Dispose(); + this.TextWindow.Free(); } protected void Start() { - TextWindow.Start(); + LexemeStartPosition = this.TextWindow.Position; _errors = null; } @@ -135,10 +138,10 @@ private int GetLexemeOffsetFromPosition(int position) } protected string GetNonInternedLexemeText() - => TextWindow.GetText(intern: false); + => TextWindow.GetText(LexemeStartPosition, intern: false); protected string GetInternedLexemeText() - => TextWindow.GetText(intern: true); + => TextWindow.GetText(LexemeStartPosition, intern: true); protected int CurrentLexemeWidth => this.TextWindow.Position - LexemeStartPosition; diff --git a/src/Compilers/CSharp/Portable/Parser/Lexer.cs b/src/Compilers/CSharp/Portable/Parser/Lexer.cs index 0304312bc20a5..84601ad926b9f 100644 --- a/src/Compilers/CSharp/Portable/Parser/Lexer.cs +++ b/src/Compilers/CSharp/Portable/Parser/Lexer.cs @@ -1341,21 +1341,17 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info) return false; } - var currentOffset = TextWindow.Offset; - var characterWindow 
= TextWindow.CharacterWindow; - var characterWindowCount = TextWindow.CharacterWindowCount; - - var startOffset = currentOffset; + var textWindowCharSpan = this.TextWindow.CurrentWindowSpan; + var currentIndex = 0; while (true) { - if (currentOffset == characterWindowCount) - { - // no more contiguous characters. Fall back to slow path + // If we do not have any more contiguous characters within the char span that we can look at, + // then fall back to slow path + if (currentIndex == textWindowCharSpan.Length) return false; - } - switch (characterWindow[currentOffset]) + switch (textWindowCharSpan[currentIndex]) { case '&': // CONSIDER: This method is performance critical, so @@ -1405,13 +1401,13 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info) // All of the following characters are not valid in an // identifier. If we see any of them, then we know we're // done. - var length = currentOffset - startOffset; + var length = currentIndex; TextWindow.AdvanceChar(length); - info.Text = info.StringValue = TextWindow.Intern(characterWindow, startOffset, length); + info.Text = info.StringValue = TextWindow.Intern(textWindowCharSpan[..length]); info.IsVerbatim = false; return true; case >= '0' and <= '9': - if (currentOffset == startOffset) + if (currentIndex == 0) { return false; } @@ -1423,7 +1419,7 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info) case '_': // All of these characters are valid inside an identifier. // consume it and keep processing. 
- currentOffset++; + currentIndex++; continue; // case '@': verbatim identifiers are handled in the slow path @@ -2293,6 +2289,8 @@ private void ScanToEndOfLine() /// A trivia node with the whitespace text private SyntaxTrivia ScanWhitespace() { + Debug.Assert(SyntaxFacts.IsWhitespace(TextWindow.PeekChar())); + int hashCode = Hash.FnvOffsetBias; // FNV base bool onlySpaces = true; @@ -2326,6 +2324,8 @@ private SyntaxTrivia ScanWhitespace() break; } + Debug.Assert(this.CurrentLexemeWidth > 0); + if (this.CurrentLexemeWidth == 1 && onlySpaces) { return SyntaxFactory.Space; @@ -2336,24 +2336,18 @@ private SyntaxTrivia ScanWhitespace() if (width < MaxCachedTokenSize) { - return _cache.LookupTrivia( - TextWindow.CharacterWindow.AsSpan(TextWindow.LexemeRelativeStart, width), - hashCode, - CreateWhitespaceTrivia, - TextWindow); + return _cache.LookupWhitespaceTrivia( + TextWindow, + this.LexemeStartPosition, + hashCode); } else { - return CreateWhitespaceTrivia(TextWindow); + return SyntaxFactory.Whitespace(this.GetInternedLexemeText()); } } } - private static SyntaxTrivia CreateWhitespaceTrivia(SlidingTextWindow textWindow) - { - return SyntaxFactory.Whitespace(textWindow.GetText(intern: true)); - } - private void LexDirectiveAndExcludedTrivia( bool isFollowingToken, ref SyntaxListBuilder triviaList) diff --git a/src/Compilers/CSharp/Portable/Parser/LexerCache.cs b/src/Compilers/CSharp/Portable/Parser/LexerCache.cs index af20e88e0f25a..c44fc2450eec7 100644 --- a/src/Compilers/CSharp/Portable/Parser/LexerCache.cs +++ b/src/Compilers/CSharp/Portable/Parser/LexerCache.cs @@ -5,8 +5,10 @@ // #define COLLECT_STATS using System; +using System.Diagnostics; using Microsoft.CodeAnalysis.PooledObjects; using Microsoft.CodeAnalysis.Syntax.InternalSyntax; +using Microsoft.CodeAnalysis.Text; using Roslyn.Utilities; namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax @@ -181,21 +183,28 @@ internal bool TryGetKeywordKind(string key, out SyntaxKind kind) return kind != 
SyntaxKind.None; } - internal SyntaxTrivia LookupTrivia( - ReadOnlySpan textBuffer, - int hashCode, - Func createTriviaFunction, - TArg data) + internal SyntaxTrivia LookupWhitespaceTrivia( + in SlidingTextWindow textWindow, + int lexemeStartPosition, + int hashCode) { - var value = TriviaMap.FindItem(textBuffer, hashCode); + var span = TextSpan.FromBounds(lexemeStartPosition, textWindow.Position); + Debug.Assert(span.Length > 0); - if (value == null) + if (textWindow.TryGetTextIfWithinWindow(span, out var lexemeTextSpan)) { - value = createTriviaFunction(data); - TriviaMap.AddItem(textBuffer, hashCode, value); + var value = TriviaMap.FindItem(lexemeTextSpan, hashCode); + if (value == null) + { + value = SyntaxFactory.Whitespace(textWindow.GetText(lexemeStartPosition, intern: true)); + TriviaMap.AddItem(lexemeTextSpan, hashCode, value); + } + + return value; } - return value; + // Otherwise, if it's outside of the window, just grab from the underlying text. + return SyntaxFactory.Whitespace(textWindow.GetText(lexemeStartPosition, intern: true)); } // TODO: remove this when done tweaking this cache. 
diff --git a/src/Compilers/CSharp/Portable/Parser/Lexer_StringLiteral.cs b/src/Compilers/CSharp/Portable/Parser/Lexer_StringLiteral.cs index 09dcf9e03b64d..0da0c67d0f5a8 100644 --- a/src/Compilers/CSharp/Portable/Parser/Lexer_StringLiteral.cs +++ b/src/Compilers/CSharp/Portable/Parser/Lexer_StringLiteral.cs @@ -411,7 +411,7 @@ private bool ScanOpenQuote( out int startingQuoteCount) { // Handles reading the start of the interpolated string literal (up to where the content begins) - var window = _lexer.TextWindow; + ref var window = ref _lexer.TextWindow; var start = window.Position; if ((window.PeekChar(0), window.PeekChar(1), window.PeekChar(2)) is ('$', '@', '"') or ('@', '$', '"')) diff --git a/src/Compilers/CSharp/Portable/Parser/QuickScanner.cs b/src/Compilers/CSharp/Portable/Parser/QuickScanner.cs index 8c2719757aec3..d722814c193c0 100644 --- a/src/Compilers/CSharp/Portable/Parser/QuickScanner.cs +++ b/src/Compilers/CSharp/Portable/Parser/QuickScanner.cs @@ -193,19 +193,22 @@ private enum CharFlags : byte { this.Start(); var state = QuickScanState.Initial; - int i = TextWindow.Offset; - int n = TextWindow.CharacterWindowCount; - n = Math.Min(n, i + MaxCachedTokenSize); + + var textWindowCharSpan = TextWindow.CurrentWindowSpan; + + // Cap how much of the char span we're willing to look at. + textWindowCharSpan = textWindowCharSpan[..Math.Min(MaxCachedTokenSize, textWindowCharSpan.Length)]; int hashCode = Hash.FnvOffsetBias; //localize frequently accessed fields - var charWindow = TextWindow.CharacterWindow; var charPropLength = CharProperties.Length; - for (; i < n; i++) + // Where we are currently pointing in the charWindow as we read in a character at a time. + var currentIndex = 0; + for (; currentIndex < textWindowCharSpan.Length; currentIndex++) { - char c = charWindow[i]; + char c = textWindowCharSpan[currentIndex]; int uc = unchecked((int)c); var flags = uc < charPropLength ? 
(CharFlags)CharProperties[uc] : CharFlags.Complex; @@ -228,14 +231,22 @@ private enum CharFlags : byte state = QuickScanState.Bad; // ran out of characters in window exitWhile: - TextWindow.AdvanceChar(i - TextWindow.Offset); Debug.Assert(state == QuickScanState.Bad || state == QuickScanState.Done, "can only exit with Bad or Done"); if (state == QuickScanState.Done) { // this is a good token! + var tokenLength = currentIndex; + + Debug.Assert(tokenLength > 0); + + // It is fine to advance text window here. AdvanceChar is doc'ed to not change charWindow in any way. + // Note: we need to advance here, instead of after LookupToken as LookupToken can call into CreateQuickToken + // as a callback, which expects the text window to be in the position after lexing has occurred. + TextWindow.AdvanceChar(tokenLength); + var token = _cache.LookupToken( - TextWindow.CharacterWindow.AsSpan(TextWindow.LexemeRelativeStart, i - TextWindow.LexemeRelativeStart), + textWindowCharSpan[..tokenLength], hashCode, CreateQuickToken, this); @@ -243,7 +254,6 @@ private enum CharFlags : byte } else { - TextWindow.Reset(TextWindow.LexemeStartPosition); return null; } } @@ -251,9 +261,9 @@ private enum CharFlags : byte private static SyntaxToken CreateQuickToken(Lexer lexer) { #if DEBUG - var quickWidth = lexer.TextWindow.Width; + var quickWidth = lexer.CurrentLexemeWidth; #endif - lexer.TextWindow.Reset(lexer.TextWindow.LexemeStartPosition); + lexer.TextWindow.Reset(lexer.LexemeStartPosition); var token = lexer.LexSyntaxToken(); #if DEBUG Debug.Assert(quickWidth == token.FullWidth); diff --git a/src/Compilers/CSharp/Portable/Parser/SlidingTextWindow.cs b/src/Compilers/CSharp/Portable/Parser/SlidingTextWindow.cs index fa5fd29df9885..bc1d2702e51e9 100644 --- a/src/Compilers/CSharp/Portable/Parser/SlidingTextWindow.cs +++ b/src/Compilers/CSharp/Portable/Parser/SlidingTextWindow.cs @@ -2,9 +2,14 @@ // The .NET Foundation licenses this file to you under the MIT license. 
// See the LICENSE file in the project root for more information. +#if DEBUG +#define TRACING +#endif + using System; using System.Diagnostics; using System.Text; +using System.Threading; using Microsoft.CodeAnalysis.PooledObjects; using Microsoft.CodeAnalysis.Text; using Roslyn.Utilities; @@ -12,14 +17,19 @@ namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax { /// - /// Keeps a sliding buffer over the SourceText of a file for the lexer. Also - /// provides the lexer with the ability to keep track of a current "lexeme" - /// by leaving a marker and advancing ahead the offset. The lexer can then - /// decide to "keep" the lexeme by erasing the marker, or abandon the current - /// lexeme by moving the offset back to the marker. + /// Keeps a sliding buffer over the SourceText of a file for the lexer. Generally, the sliding buffer will + /// almost always move forward a 'chunk' () at a time. However, the buffer + /// also supports moving backward to a previous location if needed. /// - internal sealed class SlidingTextWindow : IDisposable + [NonCopyable] + internal struct SlidingTextWindow { +#if TRACING + public static int GetTextInsideWindowCount = 0; + public static int GetTextOutsideWindowCount = 0; + public static long TotalTextSize = 0; +#endif + /// /// In many cases, e.g. PeekChar, we need the ability to indicate that there are /// no characters left and we have reached the end of the stream, or some other @@ -34,207 +44,190 @@ internal sealed class SlidingTextWindow : IDisposable /// public const char InvalidCharacter = char.MaxValue; - private const int DefaultWindowLength = 2048; + /// + /// This number was picked based on the roslyn codebase. If we look at parsing all files, + /// this window ensures that 97.4% of all characters grabbed are within the window. Note + /// that this is lower than expected, but heavily influenced by how heavily roslyn uses + /// large strings in its codebase to represent test content. 
With all these files, lexemes + /// average around 60 characters. If tests are excluded, then average lexeme length drops + /// to 25 characters, and the hit rate goes up to 98.4%. Because this hitrate is so high, + /// we don't bother to dynamically resize the window. Instead, when we hit the end, we just + /// grab in the next chunk of characters from the source text, as only one in every 65 lexemes + /// will need to go back to the source-text to read the full lexeme. + /// + public const int DefaultWindowLength = 4096; + + private static readonly ObjectPool s_windowPool = new ObjectPool(() => new char[DefaultWindowLength]); - private readonly SourceText _text; // Source of text to parse. - private int _basis; // Offset of the window relative to the SourceText start. - private int _offset; // Offset from the start of the window. - private readonly int _textEnd; // Absolute end position - private char[] _characterWindow; // Moveable window of chars from source text - private int _characterWindowCount; // # of valid characters in chars buffer + /// + /// Underlying source text we are lexing. This is the final truth of all characters. Chunks of + /// this text will be read into a character window as needed to allow fast contiguous access to + /// a portion of the file at a time (as many implementations have logarithmic + /// access time to characters). + /// + public SourceText Text { get; } - private int _lexemeStart; // Start of current lexeme relative to the window start. + /// + /// Absolute end position of . Attempts to read at or past this position will + /// produce . + /// + private readonly int _textEnd; - // Example for the above variables: - // The text starts at 0. - // The window onto the text starts at basis. - // The current character is at (basis + offset), AKA the current "Position". 
- // The current lexeme started at (basis + lexemeStart), which is <= (basis + offset) - // The current lexeme is the characters between the lexemeStart and the offset. + /// + /// The current position in that we are reading characters from. This is the absolute + /// position in the source text. This is not allowed to be negative. It is allowed to be greater than or + /// equal to + /// + private int _positionInText; - private readonly StringTable _strings; + /// + /// Current segment of readable characters. The backing store for this is an array given out by . + /// The length of this will normally be , but may be smaller if we are at the end of the file + /// and there are not enough characters left to fill the window. + /// + private ArraySegment _characterWindow; - private static readonly ObjectPool s_windowPool = new ObjectPool(() => new char[DefaultWindowLength]); + /// + /// Where the current character window starts in the source text. This is the absolute position in the source text. + /// In other words, if this is 2048, then that means it represents the chunk of characters starting at position 2048 + /// in the source text. _characterWindow.Count represents how large the chunk is. Characters + /// [0, _characterWindow.Count) are valid characters within the window, and represent + /// the chunk [_characterWindowStartPositionInText, CharacterWindowEndPositionInText) in . + /// + private int _characterWindowStartPositionInText; + +#if DEBUG + private bool _freed; +#endif + + private readonly StringTable _strings; public SlidingTextWindow(SourceText text) { - _text = text; - _basis = 0; - _offset = 0; + this.Text = text; _textEnd = text.Length; _strings = StringTable.GetInstance(); - _characterWindow = s_windowPool.Allocate(); - _lexemeStart = 0; + _characterWindow = new ArraySegment(s_windowPool.Allocate()); + + // Read the first chunk of the file into the character window. 
+ this.ReadChunkAt(0); } - public void Dispose() + public void Free() { - if (_characterWindow != null) - { - s_windowPool.Free(_characterWindow); - _characterWindow = null!; - _strings.Free(); - } - } +#if DEBUG + Debug.Assert(!_freed); + _freed = true; +#endif - public SourceText Text => _text; + s_windowPool.Free(_characterWindow.Array!); + _strings.Free(); + } /// - /// The current absolute position in the text file. + /// Reads a chunk of characters from the underlying at the given position and places them + /// at the start of the character window. The character window's length will be set to the number of characters + /// read. + /// + /// Note: this does NOT set . We are just reading the characters into the window. The actual + /// position in the text is unchanged by this, and callers should set that if necessary. /// - public int Position + private void ReadChunkAt(int position) { - get - { - return _basis + _offset; - } + position = Math.Min(position, _textEnd); + + var amountToRead = Math.Min(_textEnd - position, DefaultWindowLength); + this.Text.CopyTo(position, _characterWindow.Array!, 0, amountToRead); + _characterWindowStartPositionInText = position; + _characterWindow = new(_characterWindow.Array!, 0, amountToRead); } /// - /// The current offset inside the window (relative to the window start). + /// The current absolute position in the text file. /// - public int Offset - { - get - { - return _offset; - } - } + public readonly int Position => _positionInText; /// - /// The buffer backing the current window. + /// Gets a view over the underlying characters that this window is currently pointing at. + /// The view starts at and contains as many legal characters from + /// that are available after that. This span may be empty. 
/// - public char[] CharacterWindow + public readonly ReadOnlySpan CurrentWindowSpan { get { - return _characterWindow; + var start = _positionInText - _characterWindowStartPositionInText; + // If we are outside the bounds of the current character window, then we cannot return a span around any of it. + if (start < 0 || start >= _characterWindow.Count) + return default; + + return _characterWindow.AsSpan(start); } } - /// - /// Returns the start of the current lexeme relative to the window start. + /// Similar to , except this represents the index (exclusive) of the last character + /// that encompasses in . This is equal to + /// + 's . /// - public int LexemeRelativeStart - { - get - { - return _lexemeStart; - } - } + private readonly int CharacterWindowEndPositionInText => _characterWindowStartPositionInText + _characterWindow.Count; /// - /// Number of characters in the character window. + /// Returns true if is within the current character window, and thus the character at that position + /// can be read from the character window without going back to the underlying . /// - public int CharacterWindowCount + private readonly bool PositionIsWithinWindow(int position) { - get - { - return _characterWindowCount; - } + return position >= _characterWindowStartPositionInText && + position < this.CharacterWindowEndPositionInText; } /// - /// The absolute position of the start of the current lexeme in the given - /// SourceText. + /// Returns true if is within the current character window, and thus the sub-string corresponding to + /// that span can be read from the character window without going back to the underlying . /// - public int LexemeStartPosition + public readonly bool SpanIsWithinWindow(TextSpan span) { - get - { - return _basis + _lexemeStart; - } + return span.Start >= _characterWindowStartPositionInText && + span.End <= this.CharacterWindowEndPositionInText; } /// - /// The number of characters in the current lexeme. 
+ /// Returns the span of characters corresponding to from if + /// is entirely within bounds of the current character window (see ). Otherwise, returns . Used to allow + /// fast path access to a sequence of characters in the common case where they are in the window, or + /// having the caller fall back to a slower approach. Note: this does not mutate the window in any way, + /// it just reads from a segment of the character window if that is available, return default for + /// if not. /// - public int Width + public readonly bool TryGetTextIfWithinWindow(TextSpan span, out ReadOnlySpan textSpan) { - get + if (SpanIsWithinWindow(span)) { - return _offset - _lexemeStart; + textSpan = _characterWindow.AsSpan(span.Start - _characterWindowStartPositionInText, span.Length); + return true; } + + textSpan = default; + return false; } /// - /// Start parsing a new lexeme. + /// Moves this window to point at the given position in the text. This will ensure that reading characters + /// (and normally characters after it) will be fast. /// - public void Start() - { - _lexemeStart = _offset; - } - public void Reset(int position) { - // if position is within already read character range then just use what we have - int relative = position - _basis; - if (relative >= 0 && relative <= _characterWindowCount) - { - _offset = relative; - } - else - { - // we need to reread text buffer - int amountToRead = Math.Min(_text.Length, position + _characterWindow.Length) - position; - amountToRead = Math.Max(amountToRead, 0); - if (amountToRead > 0) - { - _text.CopyTo(position, _characterWindow, 0, amountToRead); - } - - _lexemeStart = 0; - _offset = 0; - _basis = position; - _characterWindowCount = amountToRead; - } - } + // Move us to that position. 
+ _positionInText = Math.Min(position, _textEnd); - private bool MoreChars() - { - if (_offset >= _characterWindowCount) - { - if (this.Position >= _textEnd) - { - return false; - } - - // if lexeme scanning is sufficiently into the char buffer, - // then refocus the window onto the lexeme - if (_lexemeStart > (_characterWindowCount / 4)) - { - Array.Copy(_characterWindow, - _lexemeStart, - _characterWindow, - 0, - _characterWindowCount - _lexemeStart); - _characterWindowCount -= _lexemeStart; - _offset -= _lexemeStart; - _basis += _lexemeStart; - _lexemeStart = 0; - } - - if (_characterWindowCount >= _characterWindow.Length) - { - // grow char array, since we need more contiguous space - char[] oldWindow = _characterWindow; - char[] newWindow = new char[_characterWindow.Length * 2]; - Array.Copy(oldWindow, 0, newWindow, 0, _characterWindowCount); - s_windowPool.ForgetTrackedObject(oldWindow, newWindow); - _characterWindow = newWindow; - } - - int amountToRead = Math.Min(_textEnd - (_basis + _characterWindowCount), - _characterWindow.Length - _characterWindowCount); - _text.CopyTo(_basis + _characterWindowCount, - _characterWindow, - _characterWindowCount, - amountToRead); - _characterWindowCount += amountToRead; - return amountToRead > 0; - } + // if position is within already read character range then just use what we have + if (PositionIsWithinWindow(_positionInText)) + return; - return true; + // Otherwise, ensure that the character window contains this position so we can read characters directly + // from there. + this.ReadChunkAt(_positionInText); } /// @@ -243,24 +236,21 @@ private bool MoreChars() /// /// Comments and string literals are allowed to contain any Unicode character. /// - /// - internal bool IsReallyAtEnd() - { - return _offset >= _characterWindowCount && Position >= _textEnd; - } + public readonly bool IsReallyAtEnd() + => Position >= _textEnd; /// - /// Advance the current position by one. No guarantee that this - /// position is valid. 
+ /// Advance the current position by one. No guarantee that this position is valid. This will not change the character window + /// in any way. Specifically, it will not create a new character window, nor will it change the contents of the current window. /// public void AdvanceChar() - { - _offset++; - } + => AdvanceChar(1); /// /// Advances the text window if it currently pointing at the character. Returns if it did advance, otherwise. + /// langword="true"/> if it did advance, otherwise. This can change the + /// character window if the is at the end of the current character window, and + /// peeking then needs to read in a new chunk of characters to compare to . /// public bool TryAdvance(char c) { @@ -272,12 +262,13 @@ public bool TryAdvance(char c) } /// - /// Advance the current position by n. No guarantee that this position - /// is valid. + /// Advance the current position by n. No guarantee that this position is valid. This will not change the character window + /// in any way. Specifically, it will not create a new character window, nor will it change the contents of the current window. /// public void AdvanceChar(int n) { - _offset += n; + _positionInText += n; + Debug.Assert(_positionInText >= 0, "Position in text cannot be negative."); } /// @@ -332,14 +323,17 @@ public char NextChar() /// public char PeekChar() { - if (_offset >= _characterWindowCount - && !MoreChars()) - { + if (IsReallyAtEnd()) return InvalidCharacter; - } - // N.B. MoreChars may update the offset. - return _characterWindow[_offset]; + var position = _positionInText; + + // If the position is outside of the bounds of the current character window, then update its contents to + // contain that position. 
+ if (!PositionIsWithinWindow(position)) + this.ReadChunkAt(position); + + return _characterWindow.Array![position - _characterWindowStartPositionInText]; } /// @@ -353,41 +347,13 @@ public char PeekChar(int delta) { int position = this.Position; this.AdvanceChar(delta); - - char ch; - if (_offset >= _characterWindowCount - && !MoreChars()) - { - ch = InvalidCharacter; - } - else - { - // N.B. MoreChars may update the offset. - ch = _characterWindow[_offset]; - } - + var ch = PeekChar(); this.Reset(position); return ch; } public char PreviousChar() - { - Debug.Assert(this.Position > 0); - if (_offset > 0) - { - // The allowed region of the window that can be read is from 0 to _characterWindowCount (which _offset - // is in between). So as long as _offset is greater than 0, we can read the previous character directly - // from the current chunk of characters in the window. - return this.CharacterWindow[_offset - 1]; - } - - // The prior character isn't in the window (trying to read the current character caused us to - // read in the next chunk of text into the window, throwing out the preceding characters). - // Just go back to the source text to find this character. While more expensive, this should - // be rare given that most of the time we won't be calling this right after loading a new text - // chunk. 
- return this.Text[this.Position - 1]; - } + => PeekChar(-1); /// /// If the next characters in the window match the given string, @@ -409,87 +375,99 @@ internal bool AdvanceIfMatches(string desired) return true; } - public string Intern(StringBuilder text) + public readonly string Intern(StringBuilder text) { return _strings.Add(text); } - public string Intern(char[] array, int start, int length) - { - return _strings.Add(array.AsSpan(start, length)); - } + public readonly string Intern(char[] array, int start, int length) + => Intern(array.AsSpan(start, length)); - public string GetInternedText() - { - return this.Intern(_characterWindow, _lexemeStart, this.Width); - } + public readonly string Intern(ReadOnlySpan chars) + => _strings.Add(chars); - public string GetText(bool intern) - { - return this.GetText(this.LexemeStartPosition, this.Width, intern); - } + /// + /// Gets the text, in the range [startPosition, this.Position) from . + /// This will grab the text from the character window if it is within the bounds of the current + /// chunk, or from the underlying if it is not. + /// + public readonly string GetText(int startPosition, bool intern) + => this.GetText(startPosition, this.Position - startPosition, intern); - public string GetText(int position, int length, bool intern) + /// + /// Gets the text, in the range [position, position + length) from . + /// This will grab the text from the character window if it is within the bounds of the current + /// chunk, or from the underlying if it is not. + /// + public readonly string GetText(int position, int length, bool intern) { - int offset = position - _basis; + var span = new TextSpan(position, length); + +#if TRACING + Interlocked.Add(ref TotalTextSize, length); +#endif + + // If the chunk being grabbed is not within the bounds of what is in the character window, + // then grab it from the source text. See docs at top of file for details on how common + // this is. 
+ if (!SpanIsWithinWindow(span)) + { +#if TRACING + Interlocked.Increment(ref GetTextOutsideWindowCount); +#endif + return this.Text.ToString(span); + } + +#if TRACING + Interlocked.Increment(ref GetTextInsideWindowCount); +#endif + + var offset = position - _characterWindowStartPositionInText; + var array = _characterWindow.Array!; // PERF: Whether interning or not, there are some frequently occurring // easy cases we can pick off easily. switch (length) { - case 0: - return string.Empty; + case 0: return string.Empty; case 1: - if (_characterWindow[offset] == ' ') - { - return " "; - } - if (_characterWindow[offset] == '\n') + switch (array[offset]) { - return "\n"; + case ' ': return " "; + case '\n': return "\n"; } + break; case 2: - char firstChar = _characterWindow[offset]; - if (firstChar == '\r' && _characterWindow[offset + 1] == '\n') - { - return "\r\n"; - } - if (firstChar == '/' && _characterWindow[offset + 1] == '/') + switch (array[offset], array[offset + 1]) { - return "//"; + case ('\r', '\n'): return "\r\n"; + case ('/', '/'): return "//"; } + break; case 3: - if (_characterWindow[offset] == '/' && _characterWindow[offset + 1] == '/' && _characterWindow[offset + 2] == ' ') + if (array[offset] == '/' && array[offset + 1] == '/' && array[offset + 2] == ' ') { return "// "; } + break; } - if (intern) - { - return this.Intern(_characterWindow, offset, length); - } - else - { - return new string(_characterWindow, offset, length); - } + return intern + ? 
this.Intern(array, offset, length) + : new string(array, offset, length); } - internal TestAccessor GetTestAccessor() - => new TestAccessor(this); - - internal readonly struct TestAccessor(SlidingTextWindow window) + public static class TestAccessor { - private readonly SlidingTextWindow _window = window; - - internal void SetDefaultCharacterWindow() - => _window._characterWindow = new char[DefaultWindowLength]; + public static int GetOffset(in SlidingTextWindow window) => window._positionInText - window._characterWindowStartPositionInText; + public static int GetCharacterWindowStartPositionInText(in SlidingTextWindow window) => window._characterWindowStartPositionInText; + public static ArraySegment GetCharacterWindow(in SlidingTextWindow window) => window._characterWindow; } } } diff --git a/src/Compilers/CSharp/Test/Syntax/LexicalAndXml/LexicalTests.cs b/src/Compilers/CSharp/Test/Syntax/LexicalAndXml/LexicalTests.cs index a2360d1bf4c0d..87ee09a07c453 100644 --- a/src/Compilers/CSharp/Test/Syntax/LexicalAndXml/LexicalTests.cs +++ b/src/Compilers/CSharp/Test/Syntax/LexicalAndXml/LexicalTests.cs @@ -4577,15 +4577,20 @@ disabled text 2 [Fact, WorkItem("https://github.com/dotnet/roslyn/issues/78593")] public void TestDotPrefixedNumberStartingAtStartOfSlidingTextWindow() { - // This test depends on the line endings for the file being \r\n to ensure the right contents lines up at - // the right locations. + // Make a file that looks effectively like: // - // It specifically validates what happens when we see `.0` at the start of the - // sliding text window, where the lexer tries to peek back one char to see if this - // is actually `..0` (a range expr) or `.0` (a floating point number). - var code = Resources.DotPrefixedNumberStartingAtStartOfSlidingTextWindow; - if (!code.Contains("\r\n")) - code = code.Replace("\n", "\r\n"); + // ////// + // ; + // ..0; + // + // We want both dots at the end of the window. 
When we lex the first dot, we'll peek ahead to see if we're + // on a number, and we don't want that to move the window forward. Then when we lex the second dot, we will + // peek forward, making the text window point at "0;". + // + // Because we will have seen `.0` we will attempt to peek backwards to see if the prior token was a dot. + // This will involve reading back a chunk that includes that dot. + var windowEnd = "\r\n;\r\n.."; + var code = new string('/', SlidingTextWindow.DefaultWindowLength - windowEnd.Length) + windowEnd + "0;"; var sourceText = SourceText.From(code); @@ -4594,10 +4599,6 @@ public void TestDotPrefixedNumberStartingAtStartOfSlidingTextWindow() using var lexer = new Lexer(sourceText, CSharpParseOptions.Default); - // Ensure we have a normal window size, not some larger array that another test created and cached in - // the window pool - lexer.TextWindow.GetTestAccessor().SetDefaultCharacterWindow(); - using var parser = new LanguageParser(lexer, oldTree: null, changes: null); Microsoft.CodeAnalysis.SyntaxTreeExtensions.VerifySource( @@ -4610,52 +4611,27 @@ public void TestDotPrefixedNumberStartingAtStartOfSlidingTextWindow() // (a dot token starting a number, right at the start of the character window). var lexer = new Lexer(sourceText, CSharpParseOptions.Default); - // Ensure we have a normal window size, not some larger array that another test created and cached in - // the window pool - lexer.TextWindow.GetTestAccessor().SetDefaultCharacterWindow(); - var mode = LexerMode.Syntax; - for (var i = 0; i < 1326; i++) - lexer.Lex(ref mode); - - // Lexer will read from index 0 in the arrray. - Assert.Equal(0, lexer.TextWindow.Offset); - - // We have 205 real chars in the window - Assert.Equal(205, lexer.TextWindow.CharacterWindowCount); - - // The lexer is at position 10199 in the file. 
- Assert.Equal(10199, lexer.TextWindow.Position); - - /// The 205 characters represent the final part of the doc - Assert.Equal(lexer.TextWindow.Text.Length, lexer.TextWindow.Position + lexer.TextWindow.CharacterWindowCount); - - // We're at the start of a token. - Assert.Equal(lexer.TextWindow.LexemeStartPosition, lexer.TextWindow.Position); - - // Ensure that the lexer's window is starting with the next FP number (".03") right at - // the start of the window. - Assert.True(lexer.TextWindow.CharacterWindow is ['.', '0', '3', ',', ..], $"Start of window was '{new string(lexer.TextWindow.CharacterWindow, 0, 4)}'"); - - var token = lexer.Lex(ref mode); - Assert.Equal(SyntaxKind.NumericLiteralToken, token.Kind); - Assert.Equal(3, token.FullWidth); - Assert.Equal(".03", token.ToString()); - - // But we moved 3 characters forward. - Assert.Equal(3, lexer.TextWindow.Offset); - - // We still have 205 real chars in the window - Assert.Equal(205, lexer.TextWindow.CharacterWindowCount); - - // The lexer position has moved 3 characters forward as well. - Assert.Equal(10202, lexer.TextWindow.Position); - - // We're at the start of a token. - Assert.Equal(lexer.TextWindow.LexemeStartPosition, lexer.TextWindow.Position); - - // Character window didn't changee. 
- Assert.True(lexer.TextWindow.CharacterWindow is ['.', '0', '3', ',', ..], $"Start of window was '{new string(lexer.TextWindow.CharacterWindow, 0, 4)}'"); + var token1 = lexer.Lex(ref mode); + Assert.Equal(SyntaxKind.SemicolonToken, token1.Kind); + Assert.Equal(SlidingTextWindow.DefaultWindowLength - 2, token1.FullWidth); + + Assert.Equal(SlidingTextWindow.DefaultWindowLength - 2, SlidingTextWindow.TestAccessor.GetOffset(lexer.TextWindow)); + var token2 = lexer.Lex(ref mode); + Assert.Equal(SyntaxKind.DotToken, token2.Kind); + + Assert.Equal(SlidingTextWindow.DefaultWindowLength - 1, SlidingTextWindow.TestAccessor.GetOffset(lexer.TextWindow)); + var token3 = lexer.Lex(ref mode); + Assert.Equal(SyntaxKind.DotToken, token3.Kind); + + // We will have jumped the window backwards to be able to read the prior dot. + Assert.Equal(code.IndexOf('.'), SlidingTextWindow.TestAccessor.GetCharacterWindowStartPositionInText(lexer.TextWindow)); + Assert.Equal(2, SlidingTextWindow.TestAccessor.GetOffset(lexer.TextWindow)); + Assert.StartsWith("..0;", SlidingTextWindow.TestAccessor.GetCharacterWindow(lexer.TextWindow).AsSpan().ToString()); + + var token4 = lexer.Lex(ref mode); + Assert.Equal(SyntaxKind.NumericLiteralToken, token4.Kind); + Assert.Equal("0", token4.ValueText); } } } diff --git a/src/Compilers/CSharp/Test/Syntax/LexicalAndXml/SyntaxTokenParserTests.cs b/src/Compilers/CSharp/Test/Syntax/LexicalAndXml/SyntaxTokenParserTests.cs index 7a3ae5b79fff7..081c938863a37 100644 --- a/src/Compilers/CSharp/Test/Syntax/LexicalAndXml/SyntaxTokenParserTests.cs +++ b/src/Compilers/CSharp/Test/Syntax/LexicalAndXml/SyntaxTokenParserTests.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
@@ -193,7 +193,7 @@ public void SkipForwardTo_PastDocumentEnd() var sourceText = SourceText.From("class C { }"); var parser = SyntaxFactory.CreateTokenParser(sourceText, TestOptions.Regular); parser.SkipForwardTo(100); - AssertToken(expectedKind: SyntaxKind.EndOfFileToken, expectedContextualKind: SyntaxKind.None, new TextSpan(100, 0), "", parser.ParseNextToken()); + AssertToken(expectedKind: SyntaxKind.EndOfFileToken, expectedContextualKind: SyntaxKind.None, new TextSpan(11, 0), "", parser.ParseNextToken()); } [Fact] diff --git a/src/Compilers/CSharp/Test/Syntax/Resources.resx b/src/Compilers/CSharp/Test/Syntax/Resources.resx index 37ec16cc49bde..d5101d3f5d885 100644 --- a/src/Compilers/CSharp/Test/Syntax/Resources.resx +++ b/src/Compilers/CSharp/Test/Syntax/Resources.resx @@ -1,6 +1,5 @@ -