Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RegexDiff] MihaZupan/runtime/regex-fewerUintCasts #614

Open
MihuBot opened this issue Sep 2, 2024 · 1 comment
Open

[RegexDiff] MihaZupan/runtime/regex-fewerUintCasts #614

MihuBot opened this issue Sep 2, 2024 · 1 comment

Comments

@MihuBot
Copy link
Owner

MihuBot commented Sep 2, 2024

Job completed in 26 minutes 33 seconds.

Using arguments: regexdiff -NoPRLink

Examples of GeneratedRegex source diffs
"[^'\",]+'[^^']+'|[^'\",]+\"[^\"]+\"|[^,]+" (21563 uses)
[GeneratedRegex("[^'\",]+'[^^']+'|[^'\",]+\"[^\"]+\"|[^,]+", RegexOptions.None)]
  }
  
  // Match '\''.
-   if ((uint)slice.Length < 2 || slice[1] != '\'')
+   if (slice.Length < 2 || slice[1] != '\'')
  {
      goto AlternationBranch;
  }
  }
  
  // Match '"'.
-   if ((uint)slice.Length < 2 || slice[1] != '"')
+   if (slice.Length < 2 || slice[1] != '"')
  {
      goto AlternationBranch1;
  }
"(?:,\"|^\")(\"\"|[\\w\\W]*?)(?=\",|\"$)|(?:, ..." (18474 uses)
[GeneratedRegex("(?:,\"|^\")(\"\"|[\\w\\W]*?)(?=\",|\"$)|(?:,(?!\")|^(?!\"))([^,]*?)(?=$|,)|(\r\n|\n)", RegexOptions.Compiled)]
  // Branch 0
  {
      // Match ','.
-       if ((uint)slice.Length < 2 || slice[1] != ',')
+       if (slice.Length < 2 || slice[1] != ',')
      {
          goto AlternationBranch3;
      }
      }
      
      // Match '"'.
-       if ((uint)slice.Length < 2 || slice[1] != '"')
+       if (slice.Length < 2 || slice[1] != '"')
      {
          goto NegativeLookaroundMatch;
      }
  {
      case '\r':
          // Match '\n'.
-           if ((uint)slice.Length < 2 || slice[1] != '\n')
+           if (slice.Length < 2 || slice[1] != '\n')
          {
              UncaptureUntil(0);
              return false; // The input didn't match.
"^\\s*(((?<ORIGIN>(((\\d+>)?[a-zA-Z]?:[^:]*)| ..." (7826 uses)
[GeneratedRegex("^\\s*(((?<ORIGIN>(((\\d+>)?[a-zA-Z]?:[^:]*)|([^:]*))):)|())(?<SUBCATEGORY>(()|([^:]*? )))(?<CATEGORY>(error|warning))( \\s*(?<CODE>[^: ]*))?\\s*:(?<TEXT>.*)$", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
  {
      case 'E' or 'e':
          
-           if ((uint)slice.Length < 5 ||
+           if (slice.Length < 5 ||
              !slice.Slice(1).StartsWith("rror", StringComparison.OrdinalIgnoreCase)) // Match the string "rror" (ordinal case-insensitive)
          {
              goto CaptureBacktrack7;
          
      case 'W' or 'w':
          
-           if ((uint)slice.Length < 7 ||
+           if (slice.Length < 7 ||
              !slice.Slice(1).StartsWith("arning", StringComparison.OrdinalIgnoreCase)) // Match the string "arning" (ordinal case-insensitive)
          {
              goto CaptureBacktrack7;
"^\\s*(?<ORIGIN>(?<FILENAME>.*):(?<LOCATION>( ..." (7826 uses)
[GeneratedRegex("^\\s*(?<ORIGIN>(?<FILENAME>.*):(?<LOCATION>(?<LINE>[0-9]*):(?<COLUMN>[0-9]*))):(?<CATEGORY> error| warning):(?<TEXT>.*)", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
  // Match with 2 alternative expressions.
  //{
-       if ((uint)slice.Length < 2)
+       if (slice.Length < 2)
      {
          goto CaptureBacktrack1;
      }
      {
          case 'E' or 'e':
              
-               if ((uint)slice.Length < 6 ||
+               if (slice.Length < 6 ||
                  !slice.Slice(2).StartsWith("rror", StringComparison.OrdinalIgnoreCase)) // Match the string "rror" (ordinal case-insensitive)
              {
                  goto CaptureBacktrack1;
              
          case 'W' or 'w':
              
-               if ((uint)slice.Length < 8 ||
+               if (slice.Length < 8 ||
                  !slice.Slice(2).StartsWith("arning", StringComparison.OrdinalIgnoreCase)) // Match the string "arning" (ordinal case-insensitive)
              {
                  goto CaptureBacktrack1;
"^[A-Za-z]:" (5756 uses)
[GeneratedRegex("^[A-Za-z]:", RegexOptions.Compiled)]
      return false; // The input didn't match.
  }
  
-   if ((uint)slice.Length < 2 ||
+   if (slice.Length < 2 ||
      !char.IsAsciiLetter(slice[0]) || // Match a character in the set [A-Za-z].
      slice[1] != ':') // Match ':'.
  {
"[A-z-[dDfFiIoOqQuUwWzZ]]\\d[A-z-[dDfFiIoOqQu ..." (5703 uses)
[GeneratedRegex("[A-z-[dDfFiIoOqQuUwWzZ]]\\d[A-z-[dDfFiIoOqQuU]] *\\d[A-z-[dDfFiIoOqQuU]]\\d\\b", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
  char ch;
  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
  
-   if ((uint)slice.Length < 3 ||
+   if (slice.Length < 3 ||
      ((ch = slice[0]) < 128 ? ("\0\0\0\0綮ﭝ綯͝"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0 \0DEFGIJOPQRUVWXZ[defgijopqruvwxz{")) || // Match a character in the set [A-z\u212A-[DFIOQUWZdfioquwz]].
      !char.IsDigit(slice[1]) || // Match a Unicode digit.
      ((ch = slice[2]) < 128 ? ("\0\0\0\0綮\uffdd綯ߝ"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0\u0018\0DEFGIJOPQRUVdefgijopqruv"))) // Match a character in the set [A-z\u212A-[DFIOQUdfioqu]].
      pos += iteration;
  }
  
-   if ((uint)slice.Length < 6 ||
+   if (slice.Length < 6 ||
      !char.IsDigit(slice[3]) || // Match a Unicode digit.
      ((ch = slice[4]) < 128 ? ("\0\0\0\0綮\uffdd綯ߝ"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\u0004\0A{KÅ\0\u0018\0DEFGIJOPQRUVdefgijopqruv")) || // Match a character in the set [A-z\u212A-[DFIOQUdfioqu]].
      !char.IsDigit(slice[5])) // Match a Unicode digit.
"\\d{5}$|\\d{5}-\\d{4}$" (5703 uses)
[GeneratedRegex("\\d{5}$|\\d{5}-\\d{4}$", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
  // Match a Unicode digit exactly 5 times.
  {
-       if ((uint)slice.Length < 5 ||
+       if (slice.Length < 5 ||
          !char.IsDigit(slice[0]) ||
          !char.IsDigit(slice[1]) ||
          !char.IsDigit(slice[2]) ||
      
      // Branch 1
      {
-           if ((uint)slice.Length < 10 ||
+           if (slice.Length < 10 ||
              slice[5] != '-' || // Match '-'.
              !char.IsDigit(slice[6]) || // Match a Unicode digit exactly 4 times.
              !char.IsDigit(slice[7]) ||
"^[a-f0-9]{32}$" (4920 uses)
[GeneratedRegex("^[a-f0-9]{32}$", RegexOptions.Compiled)]
  // Match a character in the set [0-9a-f] exactly 32 times.
  {
-       if ((uint)slice.Length < 32)
+       if (slice.Length < 32)
      {
          return false; // The input didn't match.
      }
"^((([a-z]|\\d|[!#\\$%&'\\*\\+\\-\\/=\\?\\^_` ..." (4566 uses)
[GeneratedRegex("^((([a-z]|\\d|[!#\\$%&'\\*\\+\\-\\/=\\?\\^_`{\\|}~]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])+(\\.([a-z]|\\d|[!#\\$%&'\\*\\+\\-\\/=\\?\\^_`{\\|}~]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])+)*)|((\\x22)((((\\x20|\\x09)*(\\x0d\\x0a))?(\\x20|\\x09)+)?(([\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f]|\\x21|[\\x23-\\x5b]|[\\x5d-\\x7e]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])|(\\\\([\\x01-\\x09\\x0b\\x0c\\x0d-\\x7f]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF]))))*(((\\x20|\\x09)*(\\x0d\\x0a))?(\\x20|\\x09)+)?(\\x22)))@((([a-z]|\\d|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])|(([a-z]|\\d|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])([a-z]|\\d|-|\\.|_|~|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])*([a-z]|\\d|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])))\\.)+(([a-z]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])|(([a-z]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])([a-z]|\\d|-|\\.|_|~|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])*([a-z]|[\\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF])))\\.?$", RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture | RegexOptions.Compiled)]
  // Branch 1
  //{
-       if ((uint)slice.Length < 2 ||
+       if (slice.Length < 2 ||
          slice[0] != '\\' || // Match '\\'.
          ((ch = slice[1]) < 128 ? ("ﯾ\uffff\uffff\uffff\uffff\uffff\uffff\uffff"[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, "\0\n\0\u0001\n\v\u0080 \ud800豈\ufdd0ﷰ\ufff0"))) // Match a character in the set [\u0001-\t\v-\u007F\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF].
      {
"xmlns=\"[^\"]*\"\\s*" (4166 uses)
[GeneratedRegex("xmlns=\"[^\"]*\"\\s*", RegexOptions.None)]
  }
  
  // Match '"'.
-   if ((uint)slice.Length < 8 || slice[7] != '"')
+   if (slice.Length < 8 || slice[7] != '"')
  {
      return false; // The input didn't match.
  }

For more diff examples, see https://gist.github.com/MihuBot/52cf0786819a2dbaf51ff3446e340fef

Sample source code for further analysis
// Loads the regex results from a local JSON file, downloading and extracting it first
// if the file is not already present in the working directory.
const string JsonPath = "RegexResults-614.json";
if (!File.Exists(JsonPath))
{
    // Own the HttpClient explicitly so its handler and connections are released once the
    // download is done, instead of leaking an undisposed instance.
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://runtimeutils.blob.core.windows.net/artifacts/1bedf57df2a04ba7b09924fae92c1ea2/Results.zip");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // First(...) throws InvalidOperationException if the archive has no "Results.json" entry.
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);

// RegexEntry.SearchValuesOfString is an array of value tuples, whose elements are public
// fields. System.Text.Json ignores fields unless IncludeFields is enabled, so without this
// option the tuple contents would be left at their default values.
var serializerOptions = new JsonSerializerOptions { IncludeFields = true };
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, serializerOptions)!;
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>
/// A regex pattern together with the <see cref="RegexOptions"/> it is used with and a count.
/// </summary>
/// <param name="Pattern">The regular expression pattern text.</param>
/// <param name="Options">The options associated with the pattern.</param>
/// <param name="Count">A count for the pattern — presumably the number of known uses; TODO confirm against the data.</param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>
/// One element of the deserialized results array: a known pattern plus the source text,
/// diffs and search-value data recorded for it.
/// </summary>
sealed class RegexEntry
{
    /// <summary>The pattern, options and count this entry belongs to. Must be set on initialization.</summary>
    public required KnownPattern Regex { get; set; }

    // NOTE(review): MainSource/PrSource presumably hold the generated regex source for the
    // main branch and for the PR branch respectively — confirm against the producer of the JSON.
    /// <summary>Source text labelled "main". Must be set on initialization.</summary>
    public required string MainSource { get; set; }

    /// <summary>Source text labelled "PR". Must be set on initialization.</summary>
    public required string PrSource { get; set; }

    /// <summary>Optional full diff text; may be null.</summary>
    public string? FullDiff { get; set; }

    /// <summary>Optional short diff text; may be null.</summary>
    public string? ShortDiff { get; set; }

    /// <summary>Optional array of strings; may be null. Presumably the character sets used for SearchValues&lt;char&gt; — TODO confirm.</summary>
    public string[]? SearchValuesOfChar { get; set; }

    /// <summary>
    /// Optional array of (values, comparison type) tuples; may be null.
    /// The tuple elements are fields, so JSON (de)serialization only populates them when fields are included.
    /// </summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

@MihuBot
Copy link
Owner Author

MihuBot commented Sep 2, 2024

@MihaZupan

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant