Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limited regular expressions #2504

Open
wants to merge 81 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
19afa9d
updates
gregli-msft Jun 25, 2024
0016eb8
Updates
gregli-msft Jul 1, 2024
601de22
Updates
gregli-msft Jul 2, 2024
c36e21f
Updates
gregli-msft Jul 3, 2024
6347e59
Updates
gregli-msft Jul 3, 2024
a2ab49f
Update Microsoft.PowerFx.Core.Tests.Shared.projitems
gregli-msft Jul 3, 2024
20d2fd3
Update Microsoft.PowerFx.Core.Tests.Shared.projitems
gregli-msft Jul 3, 2024
765b291
Updates
gregli-msft Jul 3, 2024
b514426
Merge branch 'gregli/regex-min' of https://github.com/microsoft/power…
gregli-msft Jul 3, 2024
9b33931
Updates
gregli-msft Jul 3, 2024
b17c233
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 3, 2024
d5463c8
Updates
gregli-msft Jul 3, 2024
2316665
Updates
gregli-msft Jul 3, 2024
ba1e124
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 4, 2024
e2e58ff
Updates
gregli-msft Jul 4, 2024
6349dd9
Updates
gregli-msft Jul 4, 2024
ed49656
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 5, 2024
5105468
Updates
gregli-msft Jul 10, 2024
bbe578d
Updates
gregli-msft Jul 10, 2024
939bf18
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 10, 2024
1c72620
update
gregli-msft Jul 10, 2024
a88f286
Merge branch 'gregli/regex-min' of https://github.com/microsoft/Power…
gregli-msft Jul 10, 2024
428164d
Updates
gregli-msft Jul 10, 2024
d724b82
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 10, 2024
3693147
Updates
gregli-msft Jul 10, 2024
d628f1a
Merge branch 'gregli/regex-min' of https://github.com/microsoft/Power…
gregli-msft Jul 10, 2024
db1405d
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 30, 2024
b79704d
Updates
gregli-msft Aug 3, 2024
15541fc
Updates
gregli-msft Aug 12, 2024
0d0c88c
Updates
gregli-msft Aug 13, 2024
e18f2fe
Updates
gregli-msft Aug 13, 2024
a06654e
Updates
gregli-msft Aug 14, 2024
08d89e0
Updates
gregli-msft Aug 14, 2024
442bf73
updates
gregli-msft Aug 15, 2024
dfa1e15
Updates 2
gregli-msft Aug 15, 2024
9d7ba42
Updates 3
gregli-msft Aug 15, 2024
d51e047
Updates
gregli-msft Aug 16, 2024
e309223
Updates
gregli-msft Aug 22, 2024
ef24bec
Merge branch 'main' into gregli/regex-min
gregli-msft Aug 22, 2024
98ad8e4
Updates
gregli-msft Aug 22, 2024
c55a842
Updates
gregli-msft Aug 30, 2024
c5dfdb0
Merge branch 'main' into gregli/regex-min
gregli-msft Aug 30, 2024
e646a6f
Merge branch 'main' into gregli/regex-min
gregli-msft Aug 30, 2024
856bdfe
Updates
gregli-msft Aug 31, 2024
14d7fe5
Updates
gregli-msft Aug 31, 2024
78d1197
updates
gregli-msft Aug 31, 2024
59bed91
Merge branch 'main' into gregli/regex-min
gregli-msft Aug 31, 2024
3c6f50b
Updates
gregli-msft Aug 31, 2024
9a7e568
Updates
gregli-msft Aug 31, 2024
7bf1438
Updates
gregli-msft Aug 31, 2024
54aea5f
Updates
gregli-msft Sep 1, 2024
df24948
Updates
gregli-msft Sep 1, 2024
d506d23
Updates
gregli-msft Sep 3, 2024
47086b6
Updates
gregli-msft Sep 4, 2024
3904484
Updates
gregli-msft Sep 4, 2024
d7542de
Updates
gregli-msft Sep 4, 2024
273e37e
Updates
gregli-msft Sep 4, 2024
e674cf6
Updates
gregli-msft Sep 6, 2024
280beb2
updates
gregli-msft Sep 6, 2024
01d7098
Updates
gregli-msft Sep 8, 2024
3090dfe
Merge branch 'main' into gregli/regex-min
gregli-msft Sep 8, 2024
a6bb1ad
Updates
gregli-msft Sep 9, 2024
65cadd9
Merge branch 'main' into gregli/regex-min
gregli-msft Sep 20, 2024
d8774a1
Updates
gregli-msft Sep 20, 2024
14afc3b
Updates
gregli-msft Sep 20, 2024
fc1796b
Merge branch 'main' into gregli/regex-min
gregli-msft Sep 27, 2024
709dc9b
Merge branch 'main' into gregli/regex-min
gregli-msft Oct 18, 2024
5615760
updates
gregli-msft Oct 22, 2024
6b4499b
Updates
gregli-msft Oct 24, 2024
fa7edbe
updats
gregli-msft Oct 25, 2024
13ea69a
Updates
gregli-msft Oct 25, 2024
4a254e4
Updates
gregli-msft Oct 25, 2024
18dd7ef
Merge branch 'main' into gregli/regex-min
gregli-msft Oct 29, 2024
a7a9e43
Merge branch 'main' into gregli/regex-min
gregli-msft Nov 6, 2024
46f8221
Updates
gregli-msft Nov 6, 2024
c06bb98
Updates
gregli-msft Nov 7, 2024
4162e1f
update
gregli-msft Nov 7, 2024
03f5190
Updates
gregli-msft Nov 10, 2024
eae88e8
Updates
gregli-msft Nov 13, 2024
f9f9faa
Update
gregli-msft Nov 14, 2024
147ddc5
Update
gregli-msft Nov 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/libraries/Microsoft.PowerFx.Core/Localization/Strings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,17 @@ internal static class TexlStrings
public static ErrorResourceKey ErrDecimalRequiresPowerFxV1 = new ErrorResourceKey("ErrDecimalNeedsPowerFxV1");

public static ErrorResourceKey ErrInvalidRegEx = new ErrorResourceKey("ErrInvalidRegEx");
public static ErrorResourceKey ErrInvalidRegExBadOptions = new ErrorResourceKey("ErrInvalidRegExBadOptions");
public static ErrorResourceKey ErrInvalidRegExBadOptionsNotAtFront = new ErrorResourceKey("ErrInvalidRegExBadOptionsNotAtFront");
public static ErrorResourceKey ErrInvalidRegExBadOctal = new ErrorResourceKey("ErrInvalidRegExBadOctal");
public static ErrorResourceKey ErrInvalidRegExBadBackRefSelfReferencing = new ErrorResourceKey("ErrInvalidRegExBadBackRefSelfReferencing");
public static ErrorResourceKey ErrInvalidRegExBadBackRefNotDefined = new ErrorResourceKey("ErrInvalidRegExBadBackRefNotDefined");
public static ErrorResourceKey ErrInvalidRegExBadBalancing = new ErrorResourceKey("ErrInvalidRegExBadBalancing");
public static ErrorResourceKey ErrInvalidRegExBadSingleQuoteNamedCapture = new ErrorResourceKey("ErrInvalidRegExBadSingleQuoteNamedCapture");
public static ErrorResourceKey ErrInvalidRegExBadEscape = new ErrorResourceKey("ErrInvalidRegExBadEscape");
public static ErrorResourceKey ErrInvalidRegExBadCharacterClassSubtraction = new ErrorResourceKey("ErrInvalidRegExBadCharacterClassSubtraction");
public static ErrorResourceKey ErrInvalidRegExBadConditional = new ErrorResourceKey("ErrInvalidRegExBadConditional");

public static ErrorResourceKey ErrVariableRegEx = new ErrorResourceKey("ErrVariableRegEx");
public static ErrorResourceKey InfoRegExCaptureNameHidesPredefinedFullMatchField = new ErrorResourceKey("InfoRegExCaptureNameHidesPredefinedFullMatchField");
public static ErrorResourceKey InfoRegExCaptureNameHidesPredefinedSubMatchesField = new ErrorResourceKey("InfoRegExCaptureNameHidesPredefinedSubMatchesField");
Expand Down
212 changes: 211 additions & 1 deletion src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/Match.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Globalization;
using System.Text.RegularExpressions;
using Microsoft.PowerFx.Core.App.ErrorContainers;
using Microsoft.PowerFx.Core.Binding;
Expand Down Expand Up @@ -87,7 +88,216 @@ public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DTyp
}

string regularExpression = nodeValue;
return fValid && TryCreateReturnType(regExNode, regularExpression, errors, ref returnType);
return fValid &&
(!context.Features.PowerFxV1CompatibilityRules || LimitRegularExpression(regExNode, regularExpression, errors)) &&
TryCreateReturnType(regExNode, regularExpression, errors, ref returnType);
}

// Limit regular expressions to common features that are supported, with consistent semantics, by both canonical .NET and XRegExp.
// It is better to disallow now and bring back with customer demand or as platforms add more support.
//
// Features that are disallowed:
// Capture groups
// Self-referencing groups, such as "(a\1)".
// Treat all escaped number sequences as a backreference number.
// Single quoted "(?'name'..." and "\k'name'".
// Balancing capture groups.
// Octal character codes (use Hex or Unicode instead).
// "\o" could be added in the future, but we should avoid "\0" which causes backreference confusion.
// Inline options
// Anywhere in the expression except the beginning.
// For subexpressions.
// Character classes
// Character class subtraction "[a-z-[m-n]]".
// Conditional alternation
//
// Features that aren't supported by canonical .NET will be blocked automatically when the regular expression is instantiated in TryCreateReturnType.
//
// We chose to use canonical .NET instead of RegexOptions.ECMAScript because we wanted the unicode definitions for words.
// See https://learn.microsoft.com/dotnet/standard/base-types/regular-expression-options#ecmascript-matching-behavior for more details
/// <summary>
/// Scans <paramref name="regexPattern"/> for constructs that are intentionally disallowed
/// (octal escapes, unsupported inline options, balancing groups, single quoted named captures,
/// conditional alternation, character class subtraction, reserved escapes, and undefined or
/// self-referencing backreferences) and reports the first one found.
/// </summary>
/// <param name="regExNode">Node that reported errors are attached to.</param>
/// <param name="regexPattern">The regular expression pattern text to scan.</param>
/// <param name="errors">Container that receives the error for the first disallowed construct.</param>
/// <returns>True if nothing disallowed was found; false after an error has been added to <paramref name="errors"/>.</returns>
private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IErrorContainer errors)
{
    // Scans the regular expression for interesting constructs, ignoring other elements and constructs that are legal, such as letters and numbers.
    // Order of alternation is important. .NET regular expressions are greedy and will match the first of these that it can.
    // Many subexpressions here take advantage of this, matching something that is valid, before falling through to check for something that is invalid.
    //
    // For example, consider testing "\\(\a)". This will match <goodEscape> <openCapture> <badEscapeAlpha> <closeCapture>.
    // <badEscapeAlpha> will report an error and stop further processing.
    // One might think that the "\a" could have matched <goodEscape>, but it will match <badEscapeAlpha> first because it is first in the RE.
    // One might think that the "\(" could have matched <goodEscape>, but the double backslashes will be consumed first, which is why it is important
    // to gather all the matches in a linear scan from the beginning to the end.
    var groupPunctuationRE = new Regex(
        @"
        # leading backslash
        \\(?<goodBackRefNum>[1-9]\d*) | # numeric backreference
        \\k<(?<goodBackRefName>\w+)> | # named backreference
        (?<badOctal>\\0[0-7]{0,3}) | # octal are not accepted (no XRegExp support, by design)
        (?<goodEscapeAlpha>\\
        ([bBdDfnrsStvwW] | # standard regex character classes, missing from .NET are aAeGzZ (no XRegExp support), other common are u{} and o
        [pP]\{\w+\} | # unicode character classes
        c[a-zA-Z] | # Ctrl character classes
        x[0-9a-fA-F]{2} | # hex character, must be exactly 2 hex digits
        u[0-9a-fA-F]{4})) | # Unicode characters, must be exactly 4 hex digits
        (?<badEscapeAlpha>\\[a-zA-Z_]) | # reserving all other letters and underscore for future use (consistent with .NET)
        (?<goodEscape>\\.) | # any other escaped character is allowed, but must be paired so that '\\(' is seen as '\\' followed by '(' and not '\' folloed by '\('

        # leading (?
        \(\?<(?<goodNamedCapture>\w+)> | # named capture group
        (?<goodNonCapture>\(\?:) | # non-capture group, still need to track to match with closing
        (?<goodOptions>^\(\?[im]+\)) | # inline front of expression options we do support
        (?<badOptions>\(\?(\w*-\w+|\w+)(:|\))?) | # inline options that we don't support, including disable of options (last ? portion makes for a better error message)
        (?<badBalancing>\(\?(<|')\w*-\w+(>|')?) | # .NET balancing captures are not supported (last ? portion makes for a better error message)
        (?<badSingleQuoteNamedCapture>\(\?'\w+'?) | # single quoted capture names are not supported (last ? portion makes for a better error message)
        (?<badConditional>\(\?\() | # .NET conditional alternations are not supported

        # basic open and close
        (?<openCapture>\() |
        (?<closeCapture>\)) |
        (?<openCharacterClass>\[) |
        (?<closeCharacterClass>\])
        ", RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);

    var groupCounter = 0;                               // last group number defined
    var groupNumStack = new Stack<int>();               // stack of group numbers, -1 is used for non capturing groups
    var groupNameDict = new Dictionary<string, int>();  // mapping from group names to group numbers, membership means the name was defined

    var openCharacterClass = false;                     // are we defining a character class?
    var openCharacterClassIndex = -1;                   // position of the '[' that opened the current character class

    foreach (Match groupMatch in groupPunctuationRE.Matches(regexPattern))
    {
        // ordered from most common/good to least common/bad, for fewer tests
        if (groupMatch.Groups["goodEscape"].Success || groupMatch.Groups["goodEscapeAlpha"].Success || groupMatch.Groups["goodOptions"].Success)
        {
            // all is well, nothing to do
        }
        else if (groupMatch.Groups["openCharacterClass"].Success)
        {
            if (openCharacterClass)
            {
                // character class subtraction "[a-z-[m-n]]" is not supported
                // Index is at least 1 here because an earlier '[' opened the class.
                if (regexPattern[groupMatch.Groups["openCharacterClass"].Index - 1] == '-')
                {
                    errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadCharacterClassSubtraction);
                    return false;
                }

                // else ok, "[a[b]" is supported
            }
            else
            {
                openCharacterClass = true;
                openCharacterClassIndex = groupMatch.Groups["openCharacterClass"].Index;
            }
        }
        else if (groupMatch.Groups["closeCharacterClass"].Success)
        {
            // supports "[]]" which is valid but the closing square bracket must immediately follow the open.
            // Compare against the position of the bracket that actually opened the class, rather than the
            // preceding character, so that an escaped or nested '[' (as in "[\[]" or "[a[]") still closes the class.
            if (openCharacterClass && groupMatch.Groups["closeCharacterClass"].Index - 1 != openCharacterClassIndex)
            {
                openCharacterClass = false;
                openCharacterClassIndex = -1;
            }
        }
        else if (groupMatch.Groups["openCapture"].Success || groupMatch.Groups["goodNonCapture"].Success || groupMatch.Groups["goodNamedCapture"].Success)
        {
            // parens do not need to be escaped within square brackets
            if (!openCharacterClass)
            {
                var namedCapture = groupMatch.Groups["goodNamedCapture"];

                // A repeated capture name would make Dictionary.Add throw; report it as an invalid expression instead.
                if (namedCapture.Success && groupNameDict.ContainsKey(namedCapture.Value))
                {
                    errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegEx);
                    return false;
                }

                // non capturing group still needs to match closing paren, but does not define a new group
                groupNumStack.Push(groupMatch.Groups["goodNonCapture"].Success ? -1 : ++groupCounter);
                if (namedCapture.Success)
                {
                    groupNameDict.Add(namedCapture.Value, groupCounter);
                }
            }
        }
        else if (groupMatch.Groups["closeCapture"].Success)
        {
            // parens do not need to be escaped within square brackets
            if (!openCharacterClass)
            {
                // An unbalanced ')' would make Stack.Pop throw; report it as an invalid expression instead.
                if (groupNumStack.Count == 0)
                {
                    errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegEx);
                    return false;
                }

                groupNumStack.Pop();
            }
        }
        else if (groupMatch.Groups["goodBackRefNum"].Success)
        {
            // TryParse fails only on overflow here (the capture is all digits), and a number that large cannot name a defined group.
            var backRefParsed = int.TryParse(groupMatch.Groups["goodBackRefNum"].Value, NumberStyles.None, CultureInfo.InvariantCulture, out var backRefNum);

            // group isn't defined, or not defined yet
            if (!backRefParsed || backRefNum > groupCounter)
            {
                errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBackRefNotDefined, groupMatch.Value);
                return false;
            }

            // group is not closed and thus self referencing
            if (groupNumStack.Contains(backRefNum))
            {
                errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBackRefSelfReferencing, groupMatch.Value);
                return false;
            }
        }
        else if (groupMatch.Groups["goodBackRefName"].Success)
        {
            var backRefName = groupMatch.Groups["goodBackRefName"].Value;

            // group isn't defined, or not defined yet
            if (!groupNameDict.TryGetValue(backRefName, out var groupNum))
            {
                errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBackRefNotDefined, groupMatch.Value);
                return false;
            }

            // group is not closed and thus self referencing
            if (groupNumStack.Contains(groupNum))
            {
                errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBackRefSelfReferencing, groupMatch.Value);
                return false;
            }
        }
        else if (groupMatch.Groups["badOctal"].Success)
        {
            errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadOctal, groupMatch.Groups["badOctal"].Value);
            return false;
        }
        else if (groupMatch.Groups["badBalancing"].Success)
        {
            errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBalancing, groupMatch.Groups["badBalancing"].Value);
            return false;
        }
        else if (groupMatch.Groups["badOptions"].Success)
        {
            // Distinguish options that appear after the start of the pattern from options at the start that are simply not supported.
            if (groupMatch.Groups["badOptions"].Index > 0)
            {
                errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadOptionsNotAtFront, groupMatch.Groups["badOptions"].Value);
            }
            else
            {
                errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadOptions, groupMatch.Groups["badOptions"].Value);
            }

            return false;
        }
        else if (groupMatch.Groups["badSingleQuoteNamedCapture"].Success)
        {
            errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadSingleQuoteNamedCapture, groupMatch.Groups["badSingleQuoteNamedCapture"].Value);
            return false;
        }
        else if (groupMatch.Groups["badConditional"].Success)
        {
            errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadConditional, groupMatch.Groups["badConditional"].Value);
            return false;
        }
        else if (groupMatch.Groups["badEscapeAlpha"].Success)
        {
            errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadEscape, groupMatch.Groups["badEscapeAlpha"].Value);
            return false;
        }
        else
        {
            // Every alternative in the scanner pattern is handled above; reaching here means the pattern and this chain are out of sync.
            throw new NotImplementedException("Unknown regular expression match");
        }
    }

    return true;
}

// Creates a typed result: [Match:s, Captures:*[Value:s], NamedCaptures:r[<namedCaptures>:s]]
Expand Down
Loading
Loading