Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,15 @@ private static ImmutableArray<Diagnostic> EmitRegexMethod(IndentedTextWriter wri
DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", rm.Code); // skip implicit root capture
writer.WriteLine();

writer.WriteLine($" protected override bool FindFirstChar()");
writer.WriteLine($" protected override void Scan(global::System.Text.RegularExpressions.Regex regex, global::System.ReadOnlySpan<char> text, int textstart, int prevlen, bool quick)");
writer.WriteLine($" {{");
writer.Indent += 4;
EmitScan(writer, rm, id);
writer.Indent -= 4;
writer.WriteLine($" }}");
writer.WriteLine();

writer.WriteLine($" private bool FindFirstChar(global::System.ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 4;
RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id);
Expand All @@ -233,7 +241,7 @@ private static ImmutableArray<Diagnostic> EmitRegexMethod(IndentedTextWriter wri
{
writer.WriteLine($" [global::System.Runtime.CompilerServices.SkipLocalsInit]");
}
writer.WriteLine($" protected override void Go()");
writer.WriteLine($" private bool Go(global::System.ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 4;
requiredHelpers |= EmitGo(writer, rm, id);
Expand Down Expand Up @@ -299,6 +307,115 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht)
}
}

private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id)
{
RegexOptions options = (RegexOptions)rm.Options;

// Emit locals initialization
//writer.WriteLine("this.quick = quick;");
//writer.WriteLine("base.runtextpos = textstart;");
//writer.WriteLine("base.runregex = regex;");
//writer.WriteLine("base.runtextstart = textstart;");
//writer.WriteLine("base.runtextbeg = textbeg;");
//writer.WriteLine("base.runtextend = textend;");
//writer.WriteLine("bool initialized = false;");
//writer.WriteLine();

//EmitTimeoutHandling();
//writer.WriteLine();

// Source generator doesn't support Right-To-Left so there is no need to add the sepcial bump logic.
// If Right-to-left is ever added to the source generator, then we would need to the logic to define
// bump, as well as stoppos
Debug.Assert((options & RegexOptions.RightToLeft) == 0);

EmitPrevLenCheck();
writer.WriteLine();

EmitMainSearchLoop();
writer.WriteLine();

return;

#pragma warning disable CS8321 // Local function is declared but never used
void EmitTimeoutHandling()
{
writer.WriteLine("// Handle timeout argument");
writer.WriteLine("_timeout = -1;");
writer.WriteLine("bool ignoreTimeout = global::System.Text.RegularExpressions.Regex.InfiniteMatchTimeout == timeout;");
using (EmitBlock(writer, "if (!ignoreTimeout)"))
{
writer.WriteLine("// We are using Environment.TickCount and not Stopwatch for performance reasons.");
writer.WriteLine("// Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt");
writer.WriteLine("// overflow it will still stay ahead of Environment.TickCount for comparisons made");
writer.WriteLine("// in DoCheckTimeout().");
writer.WriteLine("global::System.Text.RegularExpressions.Regex.ValidateMatchTimeout(timeout); // validate timeout as this could be called from user code due to being protected");
writer.WriteLine("_timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round;");
writer.WriteLine("_timeoutOccursAt = global::System.Environment.TickCount + _timeout;");
writer.WriteLine("_timeoutChecksToSkip = TimeoutCheckFrequency;");
Copy link
Member

@stephentoub stephentoub Feb 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The plan would be to emit fields onto the source gen'd class? #Resolved

}
}
#pragma warning restore CS8321 // Local function is declared but never used

void EmitPrevLenCheck()
{
writer.WriteLine("// If previous match was empty or failed, advance by one before matching.");
using (EmitBlock(writer, "if (prevlen == 0)"))
{
using (EmitBlock(writer, "if (textstart == text.Length)"))
{
writer.WriteLine("base.runmatch = global::System.Text.RegularExpressions.Match.Empty;");
writer.WriteLine("return;");
}
writer.WriteLine();
writer.WriteLine("base.runtextpos++;");
}
}

void EmitMainSearchLoop()
{
using (EmitBlock(writer, "while (true)"))
{
using (EmitBlock(writer, "if (FindFirstChar(text))"))
{
writer.WriteLine("base.CheckTimeout();");
writer.WriteLine();

writer.WriteLine("// If we got a match, we're done.");
using (EmitBlock(writer, "if (Go(text))"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not against Go returning a bool, but I'm also not clear why it's needed. Can't the caller of Scan see whether there was a match based on the state of the object in runmatch?

{
using (EmitBlock(writer, "if (quick)"))
{
writer.WriteLine("base.runmatch = null;");
Copy link
Member

@stephentoub stephentoub Feb 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question as elsewhere about nulling this out. Can we leave all management of runmatch to the regex library? i.e. before calling into Scan it ensures runmatch is initialized appropriately, and the Scan method doesn't interact with it directly, it just calls Capture and the library code ensures it's appropriately populated. If the regex library caller then decides to return the Match object to user code, it'll null out this field on the runner appropriately.

We want this loop to be as minimal as we can muster and also minimize interactions with all of this matching state to as little as possible, basically nothing.

For example, the current handling seems very defensive about the possibility that Go may have left some capture state left over from a previous iteration. What if it doesn't? Our Go generation code always adds an uncapture loop that uncaptures everything back to the initial crawl state in the event of a failed match, so can we just trust that to be the case, not have to reset runcrawlpos, etc.?

And the Tidy call below... if that's necessary, can we leave that up to the regex library caller? #Resolved

writer.WriteLine("return;");
}
writer.WriteLine();

writer.WriteLine("// base.runmatch!.Tidy(base.runtextpos);");
writer.WriteLine("return;");
}
writer.WriteLine();

writer.WriteLine("// Reset state for another iteration.");
writer.WriteLine("base.runtrackpos = base.runtrack!.Length;");
writer.WriteLine("base.runstackpos = base.runstack!.Length;");
writer.WriteLine("base.runcrawlpos = base.runcrawl!.Length;");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are any of these necessary?

}
writer.WriteLine();

writer.WriteLine("// We failed to find a match. If we're at the end of the input, then we are done.");
using (EmitBlock(writer, "if (base.runtextpos == text.Length)"))
{
writer.WriteLine("base.runmatch = global::System.Text.RegularExpressions.Match.Empty;");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed? If we initialize runmatch in the regex lib prior to calling Scan, and if Go ensures it uncaptures everything in the event of a failed match, can we just leave runmatch unmodified, and the regex lib will see that runmatch doesn't contain a top-level capture and will know that means a failed match?

writer.WriteLine("return;");
}
writer.WriteLine();

writer.WriteLine("base.runtextpos++;");
}
}
}

/// <summary>Emits the body of the FindFirstChar override.</summary>
private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id)
{
Expand Down Expand Up @@ -347,7 +464,6 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ
{
case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix));
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix);
break;

Expand All @@ -356,13 +472,11 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitFixedSet();
break;

case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive:
Debug.Assert(code.FindOptimizations.LiteralAfterLoop is not null);
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitLiteralAfterAtomicLoop();
break;

Expand Down Expand Up @@ -463,7 +577,6 @@ bool EmitAnchors()
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
// to boost our position to the next line, and then continue normally with any searches.
writer.WriteLine("// Beginning-of-line anchor");
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
additionalDeclarations.Add("int beginning = base.runtextbeg;");
using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')"))
{
Expand Down Expand Up @@ -763,13 +876,15 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe
writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1)};");
writer.WriteLine("base.Capture(0, start, end);");
writer.WriteLine("base.runtextpos = end;");
writer.WriteLine("return true;");
return requiredHelpers;

case RegexNodeKind.Empty:
// This case isn't common in production, but it's very common when first getting started with the
// source generator and seeing what happens as you add more to expressions. When approaching
// it from a learning perspective, this is very common, as it's the empty string you start with.
writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);");
writer.WriteLine("return true;");
return requiredHelpers;
}

Expand All @@ -781,7 +896,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe

// Declare some locals.
string sliceSpan = "slice";
writer.WriteLine("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;");
writer.WriteLine($"int original_pos = pos;");
bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm);
Expand Down Expand Up @@ -826,7 +940,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe
}
writer.WriteLine("base.runtextpos = pos;");
writer.WriteLine("base.Capture(0, original_pos, pos);");
writer.WriteLine("return;");
writer.WriteLine("return true;");
writer.WriteLine();

// We only get here in the code if the whole expression fails to match and jumps to
Expand All @@ -837,6 +951,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe
{
EmitUncaptureUntil("0");
}
writer.WriteLine("return false;");

// We're done with the match.

Expand All @@ -846,8 +961,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe
// And emit any required helpers.
if (additionalLocalFunctions.Count != 0)
{
writer.WriteLine("return;"); // not strictly necessary, just for readability

foreach (KeyValuePair<string, string[]> localFunctions in additionalLocalFunctions.OrderBy(k => k.Key))
{
writer.WriteLine();
Expand Down Expand Up @@ -2148,7 +2261,7 @@ void EmitBoundary(RegexNode node)
_ => "base.IsECMABoundary",
};

using (EmitBlock(writer, $"if ({call}(pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))"))
using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))"))
{
writer.WriteLine($"goto {doneLabel};");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ public static void CompileToAssembly(System.Text.RegularExpressions.RegexCompila
public string GroupNameFromNumber(int i) { throw null; }
public int GroupNumberFromName(string name) { throw null; }
protected void InitializeReferences() { }
public bool IsMatch(System.ReadOnlySpan<char> input) { throw null; }
public static bool IsMatch(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; }
public static bool IsMatch(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; }
public static bool IsMatch(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; }
public bool IsMatch(string input) { throw null; }
public bool IsMatch(string input, int startat) { throw null; }
public static bool IsMatch(string input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; }
Expand Down Expand Up @@ -330,17 +334,20 @@ protected void DoubleCrawl() { }
protected void DoubleStack() { }
protected void DoubleTrack() { }
protected void EnsureStorage() { }
protected abstract bool FindFirstChar();
protected abstract void Go();
protected abstract void InitTrackCount();
protected virtual bool FindFirstChar() { throw null; }
protected virtual void Go() { throw null; }
protected virtual void InitTrackCount() { throw null; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we delete our generated override of InitTrackCount in the source generator? What breaks if runtrackcount remains 0? I think that'll just mean that we end up with the default minimums enforced by InitializeForGo (or whatever it's now called). We should probably delete the override and then confirm the defaults make sense, and/or determine whether there's a different number we could be computing that would help presize this better. If memory serves, right now it's yielding the number of opcodes produced by RegexWriter that are backtracking... but that's no longer particularly relevant to how we use runtrack, and we don't use runstack at all... so there's a fair bit of waste here to be cleaned up.

protected bool IsBoundary(int index, int startpos, int endpos) { throw null; }
protected bool IsBoundary(System.ReadOnlySpan<char> inputSpan, int index, int startpos, int endpos) { throw null; }
protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; }
protected bool IsECMABoundary(System.ReadOnlySpan<char> inputSpan, int index, int startpos, int endpos) { throw null; }
protected bool IsMatched(int cap) { throw null; }
protected int MatchIndex(int cap) { throw null; }
protected int MatchLength(int cap) { throw null; }
protected int Popcrawl() { throw null; }
protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) { throw null; }
protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; }
protected internal virtual void Scan(System.Text.RegularExpressions.Regex regex, System.ReadOnlySpan<char> text, int textstart, int prevlen, bool quick) { throw null; }
Copy link
Member

@stephentoub stephentoub Feb 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As part of exposing this, we should consider whether we want to keep the names as close to the existing ones as possible, or come up with better ones, e.g. is it better for prevlen to be consistent with the existing Scan overloads that we don't actually think anyone uses, or is it better for prevlen to be named previousMatchLength, or something more descriptive like that. I don't have a strong opinion, just something to consider. #Resolved

protected void TransferCapture(int capnum, int uncapnum, int start, int end) { }
protected void Uncapture() { }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace System.Text.RegularExpressions
/// </summary>
public class Capture
{
internal Capture(string text, int index, int length)
internal Capture(string? text, int index, int length)
{
Text = text;
Index = index;
Expand All @@ -19,27 +19,38 @@ internal Capture(string text, int index, int length)
/// <summary>Returns the position in the original string where the first character of captured substring was found.</summary>
public int Index { get; private protected set; }

/// <summary>
/// This method should only be called when the text for matching was sliced with a different beginning, so the resulting index of
/// the match is not from the start of the text, but instead the start of the slice. This method will add back that extra indices
/// to account for the original text beginning.
/// </summary>
/// <param name="beginning">The original text's beginning offset.</param>
internal void AddBeginningToIndex(int beginning)
{
Index += beginning;
}

/// <summary>Returns the length of the captured substring.</summary>
public int Length { get; private protected set; }

/// <summary>The original string</summary>
internal string Text { get; set; }
internal string? Text { get; set; }

/// <summary>Gets the captured substring from the input string.</summary>
/// <value>The substring that is captured by the match.</value>
public string Value => Text.Substring(Index, Length);
public string Value => Text is string text ? text.Substring(Index, Length) : string.Empty;

/// <summary>Gets the captured span from the input string.</summary>
/// <value>The span that is captured by the match.</value>
public ReadOnlySpan<char> ValueSpan => Text.AsSpan(Index, Length);
public ReadOnlySpan<char> ValueSpan => Text is string text ? text.AsSpan(Index, Length) : ReadOnlySpan<char>.Empty;

/// <summary>Returns the substring that was matched.</summary>
public override string ToString() => Value;

/// <summary>The substring to the left of the capture</summary>
internal ReadOnlyMemory<char> GetLeftSubstring() => Text.AsMemory(0, Index);
internal ReadOnlyMemory<char> GetLeftSubstring() => Text is string text ? text.AsMemory(0, Index) : ReadOnlyMemory<char>.Empty;

/// <summary>The substring to the right of the capture</summary>
internal ReadOnlyMemory<char> GetRightSubstring() => Text.AsMemory(Index + Length, Text.Length - Index - Length);
internal ReadOnlyMemory<char> GetRightSubstring() => Text is string text ? text.AsMemory(Index + Length, Text.Length - Index - Length) : ReadOnlyMemory<char>.Empty;
}
}
Loading