Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[RegexDiff] MihaZupan/runtime/regex-fewerUintCasts #613

Open
MihuBot opened this issue Sep 2, 2024 · 1 comment
Open

[RegexDiff] MihaZupan/runtime/regex-fewerUintCasts #613

MihuBot opened this issue Sep 2, 2024 · 1 comment

Comments

@MihuBot
Copy link
Owner

MihuBot commented Sep 2, 2024

Job completed in 11 minutes 29 seconds.

Using arguments: regexdiff -SkipJitDiff -UploadTestAssembly -NoPRLink

Examples of GeneratedRegex source diffs
"\\s+" (24455 uses)
[GeneratedRegex("\\s+", RegexOptions.None)]
         /// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
         internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
         
-        /// <summary>Supports searching for characters in or not in "\t\n\v\f\r \u0085             \u2028\u2029   ".</summary>
-        internal static readonly SearchValues<char> s_whitespace = SearchValues.Create("\t\n\v\f\r \u0085             \u2028\u2029   ");
+        /// <summary>Supports searching for characters in or not in "\t\n\v\f\r \u0085             \u2028\u2029   ".</summary>
+        internal static readonly SearchValues<char> s_whitespace = SearchValues.Create("\t\n\v\f\r \u0085             \u2028\u2029   ");
     }
 }
"[^'\",]+'[^^']+'|[^'\",]+\"[^\"]+\"|[^,]+" (21563 uses)
[GeneratedRegex("[^'\",]+'[^^']+'|[^'\",]+\"[^\"]+\"|[^,]+", RegexOptions.None)]
  }
  
  // Match '\''.
-   if ((uint)slice.Length < 2 || slice[1] != '\'')
+   if (slice.Length < 2 || slice[1] != '\'')
  {
      goto AlternationBranch;
  }
  }
  
  // Match '"'.
-   if ((uint)slice.Length < 2 || slice[1] != '"')
+   if (slice.Length < 2 || slice[1] != '"')
  {
      goto AlternationBranch1;
  }
"(?:,\"|^\")(\"\"|[\\w\\W]*?)(?=\",|\"$)|(?:, ..." (18474 uses)
[GeneratedRegex("(?:,\"|^\")(\"\"|[\\w\\W]*?)(?=\",|\"$)|(?:,(?!\")|^(?!\"))([^,]*?)(?=$|,)|(\r\n|\n)", RegexOptions.Compiled)]
  // Branch 0
  {
      // Match ','.
-       if ((uint)slice.Length < 2 || slice[1] != ',')
+       if (slice.Length < 2 || slice[1] != ',')
      {
          goto AlternationBranch3;
      }
      }
      
      // Match '"'.
-       if ((uint)slice.Length < 2 || slice[1] != '"')
+       if (slice.Length < 2 || slice[1] != '"')
      {
          goto NegativeLookaroundMatch;
      }
  {
      case '\r':
          // Match '\n'.
-           if ((uint)slice.Length < 2 || slice[1] != '\n')
+           if (slice.Length < 2 || slice[1] != '\n')
          {
              UncaptureUntil(0);
              return false; // The input didn't match.
"\\s" (10990 uses)
[GeneratedRegex("\\s", RegexOptions.CultureInvariant)]
         /// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
         internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
         
-        /// <summary>Supports searching for characters in or not in "\t\n\v\f\r \u0085             \u2028\u2029   ".</summary>
-        internal static readonly SearchValues<char> s_whitespace = SearchValues.Create("\t\n\v\f\r \u0085             \u2028\u2029   ");
+        /// <summary>Supports searching for characters in or not in "\t\n\v\f\r \u0085             \u2028\u2029   ".</summary>
+        internal static readonly SearchValues<char> s_whitespace = SearchValues.Create("\t\n\v\f\r \u0085             \u2028\u2029   ");
     }
 }
"^\\s*(((?<ORIGIN>(((\\d+>)?[a-zA-Z]?:[^:]*)| ..." (7826 uses)
[GeneratedRegex("^\\s*(((?<ORIGIN>(((\\d+>)?[a-zA-Z]?:[^:]*)|([^:]*))):)|())(?<SUBCATEGORY>(()|([^:]*? )))(?<CATEGORY>(error|warning))( \\s*(?<CODE>[^: ]*))?\\s*:(?<TEXT>.*)$", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
  {
      case 'E' or 'e':
          
-           if ((uint)slice.Length < 5 ||
+           if (slice.Length < 5 ||
              !slice.Slice(1).StartsWith("rror", StringComparison.OrdinalIgnoreCase)) // Match the string "rror" (ordinal case-insensitive)
          {
              goto CaptureBacktrack7;
          
      case 'W' or 'w':
          
-           if ((uint)slice.Length < 7 ||
+           if (slice.Length < 7 ||
              !slice.Slice(1).StartsWith("arning", StringComparison.OrdinalIgnoreCase)) // Match the string "arning" (ordinal case-insensitive)
          {
              goto CaptureBacktrack7;
"^\\s*(?<ORIGIN>(?<FILENAME>.*):(?<LOCATION>( ..." (7826 uses)
[GeneratedRegex("^\\s*(?<ORIGIN>(?<FILENAME>.*):(?<LOCATION>(?<LINE>[0-9]*):(?<COLUMN>[0-9]*))):(?<CATEGORY> error| warning):(?<TEXT>.*)", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
  // Match with 2 alternative expressions.
  //{
-       if ((uint)slice.Length < 2)
+       if (slice.Length < 2)
      {
          goto CaptureBacktrack1;
      }
      {
          case 'E' or 'e':
              
-               if ((uint)slice.Length < 6 ||
+               if (slice.Length < 6 ||
                  !slice.Slice(2).StartsWith("rror", StringComparison.OrdinalIgnoreCase)) // Match the string "rror" (ordinal case-insensitive)
              {
                  goto CaptureBacktrack1;
              
          case 'W' or 'w':
              
-               if ((uint)slice.Length < 8 ||
+               if (slice.Length < 8 ||
                  !slice.Slice(2).StartsWith("arning", StringComparison.OrdinalIgnoreCase)) // Match the string "arning" (ordinal case-insensitive)
              {
                  goto CaptureBacktrack1;
"^[A-Za-z]:" (5756 uses)
[GeneratedRegex("^[A-Za-z]:", RegexOptions.Compiled)]
      return false; // The input didn't match.
  }
  
-   if ((uint)slice.Length < 2 ||
+   if (slice.Length < 2 ||
      !char.IsAsciiLetter(slice[0]) || // Match a character in the set [A-Za-z].
      slice[1] != ':') // Match ':'.
  {
"[A-z-[dDfFiIoOqQuUwWzZ]]\\d[A-z-[dDfFiIoOqQu ..." (5703 uses)
[GeneratedRegex("[A-z-[dDfFiIoOqQuUwWzZ]]\\d[A-z-[dDfFiIoOqQuU]] *\\d[A-z-[dDfFiIoOqQuU]]\\d\\b", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
  char ch;
  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
  
-   if ((uint)slice.Length < 3 ||
+   if (slice.Length < 3 ||
      ((ch = slice[0]) < 128 ? ("\0\0\0\0綮ﭝ綯͝"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0 \0DEFGIJOPQRUVWXZ[defgijopqruvwxz{")) || // Match a character in the set [A-z\u212A-[DFIOQUWZdfioquwz]].
      !char.IsDigit(slice[1]) || // Match a Unicode digit.
      ((ch = slice[2]) < 128 ? ("\0\0\0\0綮\uffdd綯ߝ"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0\u0018\0DEFGIJOPQRUVdefgijopqruv"))) // Match a character in the set [A-z\u212A-[DFIOQUdfioqu]].
      pos += iteration;
  }
  
-   if ((uint)slice.Length < 6 ||
+   if (slice.Length < 6 ||
      !char.IsDigit(slice[3]) || // Match a Unicode digit.
      ((ch = slice[4]) < 128 ? ("\0\0\0\0綮\uffdd綯ߝ"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0\u0018\0DEFGIJOPQRUVdefgijopqruv")) || // Match a character in the set [A-z\u212A-[DFIOQUdfioqu]].
      !char.IsDigit(slice[5])) // Match a Unicode digit.
"\\d{5}$|\\d{5}-\\d{4}$" (5703 uses)
[GeneratedRegex("\\d{5}$|\\d{5}-\\d{4}$", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
  // Match a Unicode digit exactly 5 times.
  {
-       if ((uint)slice.Length < 5 ||
+       if (slice.Length < 5 ||
          !char.IsDigit(slice[0]) ||
          !char.IsDigit(slice[1]) ||
          !char.IsDigit(slice[2]) ||
      
      // Branch 1
      {
-           if ((uint)slice.Length < 10 ||
+           if (slice.Length < 10 ||
              slice[5] != '-' || // Match '-'.
              !char.IsDigit(slice[6]) || // Match a Unicode digit exactly 4 times.
              !char.IsDigit(slice[7]) ||
"^[a-f0-9]{32}$" (4920 uses)
[GeneratedRegex("^[a-f0-9]{32}$", RegexOptions.Compiled)]
  // Match a character in the set [0-9a-f] exactly 32 times.
  {
-       if ((uint)slice.Length < 32)
+       if (slice.Length < 32)
      {
          return false; // The input didn't match.
      }

For more diff examples, see https://gist.github.com/MihuBot/74fd371c36382a07aa7fb19d08c970e7

Sample source code for further analysis
// Local file the results JSON is extracted to. The download below is skipped
// when this file already exists, so repeated runs reuse the same copy.
const string JsonPath = "RegexResults-613.json";
if (!File.Exists(JsonPath))
{
    // Bind the client to a using declaration so its handler and connections are
    // released when this block exits, rather than being left to finalization.
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://runtimeutils.blob.core.windows.net/artifacts/3144cd93937a4aa0a575c59ed441faae/Results.zip");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // First(...) throws InvalidOperationException if the archive has no entry named "Results.json".
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

// Read the extracted JSON and materialize every entry up front.
using FileStream jsonFileStream = File.OpenRead(JsonPath);
// The null-forgiving operator assumes the document's root is not the JSON literal null.
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream)!;
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>A regular expression pattern, the options associated with it, and a count.</summary>
/// <param name="Pattern">The regular expression pattern text.</param>
/// <param name="Options">The <see cref="RegexOptions"/> value paired with <paramref name="Pattern"/>.</param>
/// <param name="Count">Presumably the number of recorded uses of this pattern/options pair — TODO confirm against the producer of the JSON.</param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>
/// One element of the JSON array deserialized from the results file: a known pattern
/// together with source text and optional diff text associated with it.
/// </summary>
sealed class RegexEntry
{
    /// <summary>The pattern, options and count this entry describes. Required.</summary>
    public required KnownPattern Regex { get; set; }

    /// <summary>Required source text; presumably the code generated for the pattern on the main branch — TODO confirm.</summary>
    public required string MainSource { get; set; }

    /// <summary>Required source text; presumably the code generated for the pattern with the PR applied — TODO confirm.</summary>
    public required string PrSource { get; set; }

    /// <summary>Optional diff text; may be <see langword="null"/>. Presumably the complete diff between the two sources — TODO confirm.</summary>
    public string? FullDiff { get; set; }

    /// <summary>Optional diff text; may be <see langword="null"/>. Presumably a condensed form of <see cref="FullDiff"/> — TODO confirm.</summary>
    public string? ShortDiff { get; set; }

    /// <summary>Optional array of strings; may be <see langword="null"/>. Presumably the character sets passed to <c>SearchValues.Create</c> by the generated code — TODO confirm.</summary>
    public string[]? SearchValuesOfChar { get; set; }

    /// <summary>
    /// Optional array of (values, comparison type) pairs; may be <see langword="null"/>.
    /// NOTE(review): the tuple elements are fields of <c>ValueTuple</c>, and System.Text.Json
    /// ignores fields unless <c>IncludeFields</c> is enabled — confirm that this property
    /// round-trips with the serializer options actually in use.
    /// </summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

@MihuBot
Copy link
Owner Author

MihuBot commented Sep 2, 2024

@MihaZupan

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant