-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Cleanup split between lexer and sliding-text-window #79205
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 89 commits
40ad7f0
0ea5fbc
a472f31
3d1fc7f
fd45aec
16573f2
e1eb1b4
362c1cb
d8a8748
712f2f5
a852cfd
8f278a3
15fbaec
daf9f80
ebf24ab
54e6e27
284cf83
e9a740c
f6817ee
1f0ec4a
e2885e4
7fc3b85
cedf899
a127c8f
df0944b
1441828
799edf7
337ae17
31d6b9c
596a146
40a3098
ad0183f
b60a263
6a66e2a
522713c
68b59a4
7de725f
d88102e
d964f29
3857e2d
dcd95b1
28ee138
8686e17
cb19c83
7eb1016
64984e5
68d625b
f577a9f
46a9aaa
781eb9a
583b578
8be8696
45769cb
41dd1f4
4828ded
aaacaf2
8f2c2ae
2c8c6c7
7dbd1de
a1dba56
7a2ab71
c66a5c3
9b4941a
b05a58c
1f907ab
3c87760
362a47c
b69e574
7219354
91959bd
aa6e641
b4bc7a4
0491e62
e417a13
ae6d7a3
9be1995
0620cbe
51cef33
38772d9
39f10e2
a42ff59
2e41f81
64a3105
37ce151
c47db70
61d4fd5
4256939
17e1edb
de4388f
a21a493
061f4e2
b301919
636a56b
d8367f7
82ee10a
fa199ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,24 +11,27 @@ namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax | |
| // separate out text windowing implementation (keeps scanning & lexing functions from abusing details) | ||
| internal class AbstractLexer : IDisposable | ||
| { | ||
| internal readonly SlidingTextWindow TextWindow; | ||
| /// <summary> | ||
| /// Not readonly. This is a mutable struct that will be modified as we lex tokens. | ||
| /// </summary> | ||
| internal SlidingTextWindow TextWindow; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Documented later. But this makes it so that there really need to be practically no indirections between the lexer and the chars being read msot of hte time. |
||
|
|
||
| private List<SyntaxDiagnosticInfo>? _errors; | ||
| protected int LexemeStartPosition; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. moved to lexer-layer from SlidingTextWindow |
||
|
|
||
| protected AbstractLexer(SourceText text) | ||
| { | ||
| this.TextWindow = new SlidingTextWindow(text); | ||
| } | ||
|
|
||
| protected int LexemeStartPosition => this.TextWindow.LexemeStartPosition; | ||
|
|
||
| public virtual void Dispose() | ||
| { | ||
| this.TextWindow.Dispose(); | ||
| this.TextWindow.Free(); | ||
| } | ||
|
|
||
| protected void Start() | ||
| { | ||
| TextWindow.Start(); | ||
| LexemeStartPosition = this.TextWindow.Position; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the text window is now blissfully unaware of what the lexer is doing with lexemes. |
||
| _errors = null; | ||
| } | ||
|
|
||
|
|
@@ -135,10 +138,10 @@ private int GetLexemeOffsetFromPosition(int position) | |
| } | ||
|
|
||
| protected string GetNonInternedLexemeText() | ||
| => TextWindow.GetText(intern: false); | ||
| => TextWindow.GetText(LexemeStartPosition, intern: false); | ||
|
|
||
| protected string GetInternedLexemeText() | ||
| => TextWindow.GetText(intern: true); | ||
| => TextWindow.GetText(LexemeStartPosition, intern: true); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these .GetText helpers on the TextWindow were used all over the place. MOved to two simple helpers on Lexer for getting either an interned or non-interned chunk of text for the current lexeme. |
||
|
|
||
| protected int CurrentLexemeWidth | ||
| => this.TextWindow.Position - LexemeStartPosition; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this was previously exposed as the (imo) confusingly named SlidingTextWindow.Width. I tried to ensure that helpers related to lexemes used that term in their name to keep it clear what 'width/count/etc.' things are referring to. |
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1341,21 +1341,17 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info) | |
| return false; | ||
| } | ||
|
|
||
| var currentOffset = TextWindow.Offset; | ||
| var characterWindow = TextWindow.CharacterWindow; | ||
| var characterWindowCount = TextWindow.CharacterWindowCount; | ||
|
|
||
| var startOffset = currentOffset; | ||
| var textWindowCharSpan = this.TextWindow.CurrentWindowSpan; | ||
| var currentIndex = 0; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. window now just exposes things as a span, making streaming over it very easy for some of these fast-scan paths. Math gets much easier in these cases. |
||
|
|
||
| while (true) | ||
| { | ||
| if (currentOffset == characterWindowCount) | ||
| { | ||
| // no more contiguous characters. Fall back to slow path | ||
| // If we do not not have any more contiguous characters within the char span that we can look at, | ||
| // then fall back to slow path | ||
| if (currentIndex == textWindowCharSpan.Length) | ||
| return false; | ||
| } | ||
|
|
||
| switch (characterWindow[currentOffset]) | ||
| switch (textWindowCharSpan[currentIndex]) | ||
| { | ||
| case '&': | ||
| // CONSIDER: This method is performance critical, so | ||
|
|
@@ -1405,13 +1401,13 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info) | |
| // All of the following characters are not valid in an | ||
| // identifier. If we see any of them, then we know we're | ||
| // done. | ||
| var length = currentOffset - startOffset; | ||
| var length = currentIndex; | ||
| TextWindow.AdvanceChar(length); | ||
| info.Text = info.StringValue = TextWindow.Intern(characterWindow, startOffset, length); | ||
| info.Text = info.StringValue = TextWindow.Intern(textWindowCharSpan[..length]); | ||
| info.IsVerbatim = false; | ||
| return true; | ||
| case >= '0' and <= '9': | ||
| if (currentOffset == startOffset) | ||
| if (currentIndex == 0) | ||
| { | ||
| return false; | ||
| } | ||
|
|
@@ -1423,7 +1419,7 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info) | |
| case '_': | ||
| // All of these characters are valid inside an identifier. | ||
| // consume it and keep processing. | ||
| currentOffset++; | ||
| currentIndex++; | ||
| continue; | ||
|
|
||
| // case '@': verbatim identifiers are handled in the slow path | ||
|
|
@@ -2293,6 +2289,8 @@ private void ScanToEndOfLine() | |
| /// <returns>A trivia node with the whitespace text</returns> | ||
| private SyntaxTrivia ScanWhitespace() | ||
| { | ||
| Debug.Assert(SyntaxFacts.IsWhitespace(TextWindow.PeekChar())); | ||
|
|
||
| int hashCode = Hash.FnvOffsetBias; // FNV base | ||
| bool onlySpaces = true; | ||
|
|
||
|
|
@@ -2326,6 +2324,8 @@ private SyntaxTrivia ScanWhitespace() | |
| break; | ||
| } | ||
|
|
||
| Debug.Assert(this.CurrentLexemeWidth > 0); | ||
|
|
||
| if (this.CurrentLexemeWidth == 1 && onlySpaces) | ||
| { | ||
| return SyntaxFactory.Space; | ||
|
|
@@ -2336,24 +2336,18 @@ private SyntaxTrivia ScanWhitespace() | |
|
|
||
| if (width < MaxCachedTokenSize) | ||
| { | ||
| return _cache.LookupTrivia( | ||
| TextWindow.CharacterWindow.AsSpan(TextWindow.LexemeRelativeStart, width), | ||
| hashCode, | ||
| CreateWhitespaceTrivia, | ||
| TextWindow); | ||
| return _cache.LookupWhitespaceTrivia( | ||
| TextWindow, | ||
| this.LexemeStartPosition, | ||
| hashCode); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. moved off of a generic method (that was only ever called here) into a completely reified method. |
||
| } | ||
| else | ||
| { | ||
| return CreateWhitespaceTrivia(TextWindow); | ||
| return SyntaxFactory.Whitespace(this.GetInternedLexemeText()); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private static SyntaxTrivia CreateWhitespaceTrivia(SlidingTextWindow textWindow) | ||
| { | ||
| return SyntaxFactory.Whitespace(textWindow.GetText(intern: true)); | ||
| } | ||
|
|
||
| private void LexDirectiveAndExcludedTrivia( | ||
| bool isFollowingToken, | ||
| ref SyntaxListBuilder triviaList) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,8 +5,10 @@ | |
| // #define COLLECT_STATS | ||
|
|
||
| using System; | ||
| using System.Diagnostics; | ||
| using Microsoft.CodeAnalysis.PooledObjects; | ||
| using Microsoft.CodeAnalysis.Syntax.InternalSyntax; | ||
| using Microsoft.CodeAnalysis.Text; | ||
| using Roslyn.Utilities; | ||
|
|
||
| namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax | ||
|
|
@@ -181,21 +183,28 @@ internal bool TryGetKeywordKind(string key, out SyntaxKind kind) | |
| return kind != SyntaxKind.None; | ||
| } | ||
|
|
||
| internal SyntaxTrivia LookupTrivia<TArg>( | ||
| ReadOnlySpan<char> textBuffer, | ||
| int hashCode, | ||
| Func<TArg, SyntaxTrivia> createTriviaFunction, | ||
| TArg data) | ||
| internal SyntaxTrivia LookupWhitespaceTrivia( | ||
| in SlidingTextWindow textWindow, | ||
| int lexemeStartPosition, | ||
| int hashCode) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. easier to just read as the new function. There was only ever one caller ofr LookupTrivia, and it always used the same generic arg. |
||
| { | ||
| var value = TriviaMap.FindItem(textBuffer, hashCode); | ||
| var span = TextSpan.FromBounds(lexemeStartPosition, textWindow.Position); | ||
| Debug.Assert(span.Length > 0); | ||
|
|
||
| if (value == null) | ||
| if (textWindow.TryGetTextIfWithinWindow(span, out var lexemeTextSpan)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. note the user of spans here now, which is really nice for having the text window pass out the span of characters of interest, which then is exactly what the TriviaMap operates on. |
||
| { | ||
| value = createTriviaFunction(data); | ||
| TriviaMap.AddItem(textBuffer, hashCode, value); | ||
| var value = TriviaMap.FindItem(lexemeTextSpan, hashCode); | ||
| if (value == null) | ||
| { | ||
| value = SyntaxFactory.Whitespace(textWindow.GetText(lexemeStartPosition, intern: true)); | ||
| TriviaMap.AddItem(lexemeTextSpan, hashCode, value); | ||
| } | ||
|
|
||
| return value; | ||
| } | ||
|
|
||
| return value; | ||
| // Otherwise, if it's outside of the window, just grab from the underlying text. | ||
| return SyntaxFactory.Whitespace(textWindow.GetText(lexemeStartPosition, intern: true)); | ||
| } | ||
|
|
||
| // TODO: remove this when done tweaking this cache. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -193,19 +193,22 @@ private enum CharFlags : byte | |
| { | ||
| this.Start(); | ||
| var state = QuickScanState.Initial; | ||
| int i = TextWindow.Offset; | ||
| int n = TextWindow.CharacterWindowCount; | ||
| n = Math.Min(n, i + MaxCachedTokenSize); | ||
|
|
||
| var textWindowCharSpan = TextWindow.CurrentWindowSpan; | ||
|
|
||
| // Cap how much of the char span we're willing to look at. | ||
| textWindowCharSpan = textWindowCharSpan[..Math.Min(MaxCachedTokenSize, textWindowCharSpan.Length)]; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. similar place where spans make this all much simpler (and imo clearer). |
||
|
|
||
| int hashCode = Hash.FnvOffsetBias; | ||
|
|
||
| //localize frequently accessed fields | ||
| var charWindow = TextWindow.CharacterWindow; | ||
| var charPropLength = CharProperties.Length; | ||
|
|
||
| for (; i < n; i++) | ||
| // Where we are currently pointing in the charWindow as we read in a character at a time. | ||
| var currentIndex = 0; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we put this inside the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no. it is read below (it is used to figure out how many characters were actually read out). |
||
| for (; currentIndex < textWindowCharSpan.Length; currentIndex++) | ||
| { | ||
| char c = charWindow[i]; | ||
| char c = textWindowCharSpan[currentIndex]; | ||
| int uc = unchecked((int)c); | ||
|
|
||
| var flags = uc < charPropLength ? (CharFlags)CharProperties[uc] : CharFlags.Complex; | ||
|
|
@@ -228,32 +231,39 @@ private enum CharFlags : byte | |
| state = QuickScanState.Bad; // ran out of characters in window | ||
| exitWhile: | ||
|
|
||
| TextWindow.AdvanceChar(i - TextWindow.Offset); | ||
| Debug.Assert(state == QuickScanState.Bad || state == QuickScanState.Done, "can only exit with Bad or Done"); | ||
|
|
||
| if (state == QuickScanState.Done) | ||
| { | ||
| // this is a good token! | ||
| var tokenLength = currentIndex; | ||
|
|
||
| Debug.Assert(tokenLength > 0); | ||
|
|
||
| // It is fine to advance text window here. AdvanceChar is doc'ed to not change charWindow in any way. | ||
| // Note: we need to advance here, instead of after LookupToken as LookupToken can call into CreateQuickToken | ||
| // as a callback, which expects the text window to be in the position after lexing has occurred. | ||
| TextWindow.AdvanceChar(tokenLength); | ||
|
|
||
| var token = _cache.LookupToken( | ||
| TextWindow.CharacterWindow.AsSpan(TextWindow.LexemeRelativeStart, i - TextWindow.LexemeRelativeStart), | ||
| textWindowCharSpan[..tokenLength], | ||
| hashCode, | ||
| CreateQuickToken, | ||
| this); | ||
| return token; | ||
| } | ||
| else | ||
| { | ||
| TextWindow.Reset(TextWindow.LexemeStartPosition); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need to reset anything. the text window didn't move. this function only ever reads directly from the backing char array. |
||
| return null; | ||
| } | ||
| } | ||
|
|
||
| private static SyntaxToken CreateQuickToken(Lexer lexer) | ||
| { | ||
| #if DEBUG | ||
| var quickWidth = lexer.TextWindow.Width; | ||
| var quickWidth = lexer.CurrentLexemeWidth; | ||
| #endif | ||
| lexer.TextWindow.Reset(lexer.TextWindow.LexemeStartPosition); | ||
| lexer.TextWindow.Reset(lexer.LexemeStartPosition); | ||
| var token = lexer.LexSyntaxToken(); | ||
| #if DEBUG | ||
| Debug.Assert(quickWidth == token.FullWidth); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i would view this PR with whitespace off.