Skip to content
Merged
Show file tree
Hide file tree
Changes from 89 commits
Commits
Show all changes
96 commits
Select commit Hold shift + click to select a range
40ad7f0
Move field into lexer
CyrusNajmabadi Jun 30, 2025
0ea5fbc
more work
CyrusNajmabadi Jun 30, 2025
a472f31
Update cache
CyrusNajmabadi Jun 30, 2025
3d1fc7f
All done
CyrusNajmabadi Jun 30, 2025
fd45aec
Remove
CyrusNajmabadi Jun 30, 2025
16573f2
Doc
CyrusNajmabadi Jun 30, 2025
e1eb1b4
Fix
CyrusNajmabadi Jun 30, 2025
362c1cb
Fix
CyrusNajmabadi Jun 30, 2025
d8a8748
Make sure we disable tracing
CyrusNajmabadi Jun 30, 2025
712f2f5
Fix test
CyrusNajmabadi Jun 30, 2025
a852cfd
Fixup
CyrusNajmabadi Jun 30, 2025
8f278a3
Simplify
CyrusNajmabadi Jun 30, 2025
15fbaec
Add tracing
CyrusNajmabadi Jun 30, 2025
daf9f80
Add tracing
CyrusNajmabadi Jun 30, 2025
ebf24ab
Add docs
CyrusNajmabadi Jun 30, 2025
54e6e27
remove
CyrusNajmabadi Jun 30, 2025
284cf83
remove
CyrusNajmabadi Jun 30, 2025
e9a740c
Fixup test
CyrusNajmabadi Jun 30, 2025
f6817ee
Simplify
CyrusNajmabadi Jun 30, 2025
1f0ec4a
Check both ends
CyrusNajmabadi Jun 30, 2025
e2885e4
Docs
CyrusNajmabadi Jun 30, 2025
7fc3b85
cleanup
CyrusNajmabadi Jun 30, 2025
cedf899
Inline method
CyrusNajmabadi Jun 30, 2025
a127c8f
Merge branch 'main' into lexerField
CyrusNajmabadi Jul 1, 2025
df0944b
Merge remote-tracking branch 'upstream/main' into lexerField
CyrusNajmabadi Jul 2, 2025
1441828
Delete
CyrusNajmabadi Jul 2, 2025
799edf7
Merge remote-tracking branch 'upstream/main' into lexerField
CyrusNajmabadi Jul 2, 2025
337ae17
Simplify
CyrusNajmabadi Jul 2, 2025
31d6b9c
In progress
CyrusNajmabadi Jul 2, 2025
596a146
Remove unused function
CyrusNajmabadi Jul 2, 2025
40a3098
Remove unused function
CyrusNajmabadi Jul 2, 2025
ad0183f
Merge branch 'removeFunction' into lexerField
CyrusNajmabadi Jul 2, 2025
b60a263
Merge branch 'useSpan' into lexerField
CyrusNajmabadi Jul 2, 2025
6a66e2a
Use span
CyrusNajmabadi Jul 2, 2025
522713c
Simplify
CyrusNajmabadi Jul 2, 2025
68b59a4
Fixup
CyrusNajmabadi Jul 2, 2025
7de725f
Simplify
CyrusNajmabadi Jul 2, 2025
d88102e
Updates
CyrusNajmabadi Jul 2, 2025
d964f29
Make private
CyrusNajmabadi Jul 2, 2025
3857e2d
In progress
CyrusNajmabadi Jul 2, 2025
dcd95b1
In progress
CyrusNajmabadi Jul 2, 2025
28ee138
In progress
CyrusNajmabadi Jul 2, 2025
8686e17
Remove field
CyrusNajmabadi Jul 2, 2025
cb19c83
Remove field
CyrusNajmabadi Jul 2, 2025
7eb1016
REmove offset
CyrusNajmabadi Jul 2, 2025
64984e5
remove property
CyrusNajmabadi Jul 2, 2025
68d625b
remove property
CyrusNajmabadi Jul 2, 2025
f577a9f
Docs
CyrusNajmabadi Jul 2, 2025
46a9aaa
Add helper
CyrusNajmabadi Jul 2, 2025
781eb9a
Building
CyrusNajmabadi Jul 2, 2025
583b578
Renames
CyrusNajmabadi Jul 2, 2025
8be8696
Add docs
CyrusNajmabadi Jul 2, 2025
45769cb
Test accessors
CyrusNajmabadi Jul 2, 2025
41dd1f4
Test accessors
CyrusNajmabadi Jul 2, 2025
4828ded
Fix
CyrusNajmabadi Jul 2, 2025
aaacaf2
Revert
CyrusNajmabadi Jul 2, 2025
8f2c2ae
fix
CyrusNajmabadi Jul 2, 2025
2c8c6c7
Cleanup
CyrusNajmabadi Jul 2, 2025
7dbd1de
Merge branch 'main' into lexerField
CyrusNajmabadi Jul 2, 2025
a1dba56
Update src/Compilers/CSharp/csc/Program.cs
CyrusNajmabadi Jul 2, 2025
7a2ab71
remove
CyrusNajmabadi Jul 2, 2025
c66a5c3
Merge branch 'lexerField' of https://github.com/CyrusNajmabadi/roslyn…
CyrusNajmabadi Jul 2, 2025
9b4941a
Make struct
CyrusNajmabadi Jul 2, 2025
b05a58c
Update src/Compilers/CSharp/csc/Program.cs
CyrusNajmabadi Jul 2, 2025
1f907ab
REvert
CyrusNajmabadi Jul 2, 2025
3c87760
Merge branch 'lexerField' of https://github.com/CyrusNajmabadi/roslyn…
CyrusNajmabadi Jul 2, 2025
362a47c
Remove from release
CyrusNajmabadi Jul 2, 2025
b69e574
Make readonly
CyrusNajmabadi Jul 2, 2025
7219354
Make readonly
CyrusNajmabadi Jul 2, 2025
91959bd
make not readonly
CyrusNajmabadi Jul 2, 2025
aa6e641
Fix
CyrusNajmabadi Jul 2, 2025
b4bc7a4
Fix tests
CyrusNajmabadi Jul 2, 2025
0491e62
Merge remote-tracking branch 'upstream/main' into lexerField
CyrusNajmabadi Jul 3, 2025
e417a13
ifdef
CyrusNajmabadi Jul 3, 2025
ae6d7a3
No disposable
CyrusNajmabadi Jul 3, 2025
9be1995
Docs
CyrusNajmabadi Jul 3, 2025
0620cbe
share
CyrusNajmabadi Jul 3, 2025
51cef33
Merge remote-tracking branch 'upstream/main' into lexerField
CyrusNajmabadi Jul 3, 2025
38772d9
Merge
CyrusNajmabadi Jul 3, 2025
39f10e2
Simplify
CyrusNajmabadi Jul 3, 2025
a42ff59
Update src/Compilers/CSharp/Test/Syntax/Resources.resx
CyrusNajmabadi Jul 3, 2025
2e41f81
Merge branch 'main' into lexerField
CyrusNajmabadi Jul 9, 2025
64a3105
Merge branch 'main' into lexerField
CyrusNajmabadi Jul 17, 2025
37ce151
Docs
CyrusNajmabadi Jul 17, 2025
c47db70
Docs
CyrusNajmabadi Jul 17, 2025
61d4fd5
Merge remote-tracking branch 'upstream/main' into lexerField
CyrusNajmabadi Jul 17, 2025
4256939
Tweak wording
CyrusNajmabadi Jul 17, 2025
17e1edb
Merge branch 'main' into lexerField
CyrusNajmabadi Jul 21, 2025
de4388f
Merge branch 'main' into lexerField
CyrusNajmabadi Jul 23, 2025
a21a493
Apply suggestions from code review
CyrusNajmabadi Jul 23, 2025
061f4e2
Merge remote-tracking branch 'upstream/main' into lexerField
CyrusNajmabadi Jul 23, 2025
b301919
Don't return invalid span
CyrusNajmabadi Jul 23, 2025
636a56b
Add assertion
CyrusNajmabadi Jul 23, 2025
d8367f7
Remove 'else'
CyrusNajmabadi Jul 23, 2025
82ee10a
Use clamped position
CyrusNajmabadi Jul 23, 2025
fa199ec
Check both sides
CyrusNajmabadi Jul 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions src/Compilers/CSharp/Portable/Parser/AbstractLexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,27 @@ namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax
// separate out text windowing implementation (keeps scanning & lexing functions from abusing details)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would view this PR with whitespace off.

internal class AbstractLexer : IDisposable
{
internal readonly SlidingTextWindow TextWindow;
/// <summary>
/// Not readonly. This is a mutable struct that will be modified as we lex tokens.
/// </summary>
internal SlidingTextWindow TextWindow;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Documented later. But this makes it so that there really need to be practically no indirections between the lexer and the chars being read most of the time.


private List<SyntaxDiagnosticInfo>? _errors;
protected int LexemeStartPosition;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved to lexer-layer from SlidingTextWindow


protected AbstractLexer(SourceText text)
{
this.TextWindow = new SlidingTextWindow(text);
}

protected int LexemeStartPosition => this.TextWindow.LexemeStartPosition;

public virtual void Dispose()
{
this.TextWindow.Dispose();
this.TextWindow.Free();
}

protected void Start()
{
TextWindow.Start();
LexemeStartPosition = this.TextWindow.Position;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the text window is now blissfully unaware of what the lexer is doing with lexemes.

_errors = null;
}

Expand Down Expand Up @@ -135,10 +138,10 @@ private int GetLexemeOffsetFromPosition(int position)
}

protected string GetNonInternedLexemeText()
=> TextWindow.GetText(intern: false);
=> TextWindow.GetText(LexemeStartPosition, intern: false);

protected string GetInternedLexemeText()
=> TextWindow.GetText(intern: true);
=> TextWindow.GetText(LexemeStartPosition, intern: true);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These .GetText helpers on the TextWindow were used all over the place. Moved to two simple helpers on Lexer for getting either an interned or non-interned chunk of text for the current lexeme.


protected int CurrentLexemeWidth
=> this.TextWindow.Position - LexemeStartPosition;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was previously exposed as the (imo) confusingly named SlidingTextWindow.Width.

I tried to ensure that helpers related to lexemes used that term in their name to keep it clear what 'width/count/etc.' things are referring to.

Expand Down
44 changes: 19 additions & 25 deletions src/Compilers/CSharp/Portable/Parser/Lexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1341,21 +1341,17 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info)
return false;
}

var currentOffset = TextWindow.Offset;
var characterWindow = TextWindow.CharacterWindow;
var characterWindowCount = TextWindow.CharacterWindowCount;

var startOffset = currentOffset;
var textWindowCharSpan = this.TextWindow.CurrentWindowSpan;
var currentIndex = 0;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

window now just exposes things as a span, making streaming over it very easy for some of these fast-scan paths. Math gets much easier in these cases.


while (true)
{
if (currentOffset == characterWindowCount)
{
// no more contiguous characters. Fall back to slow path
// If we do not have any more contiguous characters within the char span that we can look at,
// then fall back to slow path
if (currentIndex == textWindowCharSpan.Length)
return false;
}

switch (characterWindow[currentOffset])
switch (textWindowCharSpan[currentIndex])
{
case '&':
// CONSIDER: This method is performance critical, so
Expand Down Expand Up @@ -1405,13 +1401,13 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info)
// All of the following characters are not valid in an
// identifier. If we see any of them, then we know we're
// done.
var length = currentOffset - startOffset;
var length = currentIndex;
TextWindow.AdvanceChar(length);
info.Text = info.StringValue = TextWindow.Intern(characterWindow, startOffset, length);
info.Text = info.StringValue = TextWindow.Intern(textWindowCharSpan[..length]);
info.IsVerbatim = false;
return true;
case >= '0' and <= '9':
if (currentOffset == startOffset)
if (currentIndex == 0)
{
return false;
}
Expand All @@ -1423,7 +1419,7 @@ private bool ScanIdentifier_FastPath(ref TokenInfo info)
case '_':
// All of these characters are valid inside an identifier.
// consume it and keep processing.
currentOffset++;
currentIndex++;
continue;

// case '@': verbatim identifiers are handled in the slow path
Expand Down Expand Up @@ -2293,6 +2289,8 @@ private void ScanToEndOfLine()
/// <returns>A trivia node with the whitespace text</returns>
private SyntaxTrivia ScanWhitespace()
{
Debug.Assert(SyntaxFacts.IsWhitespace(TextWindow.PeekChar()));

int hashCode = Hash.FnvOffsetBias; // FNV base
bool onlySpaces = true;

Expand Down Expand Up @@ -2326,6 +2324,8 @@ private SyntaxTrivia ScanWhitespace()
break;
}

Debug.Assert(this.CurrentLexemeWidth > 0);

if (this.CurrentLexemeWidth == 1 && onlySpaces)
{
return SyntaxFactory.Space;
Expand All @@ -2336,24 +2336,18 @@ private SyntaxTrivia ScanWhitespace()

if (width < MaxCachedTokenSize)
{
return _cache.LookupTrivia(
TextWindow.CharacterWindow.AsSpan(TextWindow.LexemeRelativeStart, width),
hashCode,
CreateWhitespaceTrivia,
TextWindow);
return _cache.LookupWhitespaceTrivia(
TextWindow,
this.LexemeStartPosition,
hashCode);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved off of a generic method (that was only ever called here) into a completely reified method.

}
else
{
return CreateWhitespaceTrivia(TextWindow);
return SyntaxFactory.Whitespace(this.GetInternedLexemeText());
}
}
}

private static SyntaxTrivia CreateWhitespaceTrivia(SlidingTextWindow textWindow)
{
return SyntaxFactory.Whitespace(textWindow.GetText(intern: true));
}

private void LexDirectiveAndExcludedTrivia(
bool isFollowingToken,
ref SyntaxListBuilder triviaList)
Expand Down
29 changes: 19 additions & 10 deletions src/Compilers/CSharp/Portable/Parser/LexerCache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
// #define COLLECT_STATS

using System;
using System.Diagnostics;
using Microsoft.CodeAnalysis.PooledObjects;
using Microsoft.CodeAnalysis.Syntax.InternalSyntax;
using Microsoft.CodeAnalysis.Text;
using Roslyn.Utilities;

namespace Microsoft.CodeAnalysis.CSharp.Syntax.InternalSyntax
Expand Down Expand Up @@ -181,21 +183,28 @@ internal bool TryGetKeywordKind(string key, out SyntaxKind kind)
return kind != SyntaxKind.None;
}

internal SyntaxTrivia LookupTrivia<TArg>(
ReadOnlySpan<char> textBuffer,
int hashCode,
Func<TArg, SyntaxTrivia> createTriviaFunction,
TArg data)
internal SyntaxTrivia LookupWhitespaceTrivia(
in SlidingTextWindow textWindow,
int lexemeStartPosition,
int hashCode)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Easier to just read as the new function. There was only ever one caller of LookupTrivia, and it always used the same generic arg.

{
var value = TriviaMap.FindItem(textBuffer, hashCode);
var span = TextSpan.FromBounds(lexemeStartPosition, textWindow.Position);
Debug.Assert(span.Length > 0);

if (value == null)
if (textWindow.TryGetTextIfWithinWindow(span, out var lexemeTextSpan))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note the use of spans here now, which is really nice for having the text window pass out the span of characters of interest, which then is exactly what the TriviaMap operates on.

{
value = createTriviaFunction(data);
TriviaMap.AddItem(textBuffer, hashCode, value);
var value = TriviaMap.FindItem(lexemeTextSpan, hashCode);
if (value == null)
{
value = SyntaxFactory.Whitespace(textWindow.GetText(lexemeStartPosition, intern: true));
TriviaMap.AddItem(lexemeTextSpan, hashCode, value);
}

return value;
}

return value;
// Otherwise, if it's outside of the window, just grab from the underlying text.
return SyntaxFactory.Whitespace(textWindow.GetText(lexemeStartPosition, intern: true));
}

// TODO: remove this when done tweaking this cache.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ private bool ScanOpenQuote(
out int startingQuoteCount)
{
// Handles reading the start of the interpolated string literal (up to where the content begins)
var window = _lexer.TextWindow;
ref var window = ref _lexer.TextWindow;
var start = window.Position;

if ((window.PeekChar(0), window.PeekChar(1), window.PeekChar(2)) is ('$', '@', '"') or ('@', '$', '"'))
Expand Down
32 changes: 21 additions & 11 deletions src/Compilers/CSharp/Portable/Parser/QuickScanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -193,19 +193,22 @@ private enum CharFlags : byte
{
this.Start();
var state = QuickScanState.Initial;
int i = TextWindow.Offset;
int n = TextWindow.CharacterWindowCount;
n = Math.Min(n, i + MaxCachedTokenSize);

var textWindowCharSpan = TextWindow.CurrentWindowSpan;

// Cap how much of the char span we're willing to look at.
textWindowCharSpan = textWindowCharSpan[..Math.Min(MaxCachedTokenSize, textWindowCharSpan.Length)];
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similar place where spans make this all much simpler (and imo clearer).


int hashCode = Hash.FnvOffsetBias;

//localize frequently accessed fields
var charWindow = TextWindow.CharacterWindow;
var charPropLength = CharProperties.Length;

for (; i < n; i++)
// Where we are currently pointing in the charWindow as we read in a character at a time.
var currentIndex = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we put this inside the for initializer?

Copy link
Member Author

@CyrusNajmabadi CyrusNajmabadi Jul 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no. it is read below (it is used to figure out how many characters were actually read out).

for (; currentIndex < textWindowCharSpan.Length; currentIndex++)
{
char c = charWindow[i];
char c = textWindowCharSpan[currentIndex];
int uc = unchecked((int)c);

var flags = uc < charPropLength ? (CharFlags)CharProperties[uc] : CharFlags.Complex;
Expand All @@ -228,32 +231,39 @@ private enum CharFlags : byte
state = QuickScanState.Bad; // ran out of characters in window
exitWhile:

TextWindow.AdvanceChar(i - TextWindow.Offset);
Debug.Assert(state == QuickScanState.Bad || state == QuickScanState.Done, "can only exit with Bad or Done");

if (state == QuickScanState.Done)
{
// this is a good token!
var tokenLength = currentIndex;

Debug.Assert(tokenLength > 0);

// It is fine to advance text window here. AdvanceChar is doc'ed to not change charWindow in any way.
// Note: we need to advance here, instead of after LookupToken as LookupToken can call into CreateQuickToken
// as a callback, which expects the text window to be in the position after lexing has occurred.
TextWindow.AdvanceChar(tokenLength);

var token = _cache.LookupToken(
TextWindow.CharacterWindow.AsSpan(TextWindow.LexemeRelativeStart, i - TextWindow.LexemeRelativeStart),
textWindowCharSpan[..tokenLength],
hashCode,
CreateQuickToken,
this);
return token;
}
else
{
TextWindow.Reset(TextWindow.LexemeStartPosition);
Copy link
Member Author

@CyrusNajmabadi CyrusNajmabadi Jul 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to reset anything. The text window didn't move. This function only ever reads directly from the backing char array.

return null;
}
}

private static SyntaxToken CreateQuickToken(Lexer lexer)
{
#if DEBUG
var quickWidth = lexer.TextWindow.Width;
var quickWidth = lexer.CurrentLexemeWidth;
#endif
lexer.TextWindow.Reset(lexer.TextWindow.LexemeStartPosition);
lexer.TextWindow.Reset(lexer.LexemeStartPosition);
var token = lexer.LexSyntaxToken();
#if DEBUG
Debug.Assert(quickWidth == token.FullWidth);
Expand Down
Loading