Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limited regular expressions #2504

Open
wants to merge 81 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
19afa9d
updates
gregli-msft Jun 25, 2024
0016eb8
Updates
gregli-msft Jul 1, 2024
601de22
Updates
gregli-msft Jul 2, 2024
c36e21f
Updates
gregli-msft Jul 3, 2024
6347e59
Updates
gregli-msft Jul 3, 2024
a2ab49f
Update Microsoft.PowerFx.Core.Tests.Shared.projitems
gregli-msft Jul 3, 2024
20d2fd3
Update Microsoft.PowerFx.Core.Tests.Shared.projitems
gregli-msft Jul 3, 2024
765b291
Updates
gregli-msft Jul 3, 2024
b514426
Merge branch 'gregli/regex-min' of https://github.com/microsoft/power…
gregli-msft Jul 3, 2024
9b33931
Updates
gregli-msft Jul 3, 2024
b17c233
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 3, 2024
d5463c8
Updates
gregli-msft Jul 3, 2024
2316665
Updates
gregli-msft Jul 3, 2024
ba1e124
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 4, 2024
e2e58ff
Updates
gregli-msft Jul 4, 2024
6349dd9
Updates
gregli-msft Jul 4, 2024
ed49656
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 5, 2024
5105468
Updates
gregli-msft Jul 10, 2024
bbe578d
Updates
gregli-msft Jul 10, 2024
939bf18
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 10, 2024
1c72620
update
gregli-msft Jul 10, 2024
a88f286
Merge branch 'gregli/regex-min' of https://github.com/microsoft/Power…
gregli-msft Jul 10, 2024
428164d
Updates
gregli-msft Jul 10, 2024
d724b82
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 10, 2024
3693147
Updates
gregli-msft Jul 10, 2024
d628f1a
Merge branch 'gregli/regex-min' of https://github.com/microsoft/Power…
gregli-msft Jul 10, 2024
db1405d
Merge branch 'main' into gregli/regex-min
gregli-msft Jul 30, 2024
b79704d
Updates
gregli-msft Aug 3, 2024
15541fc
Updates
gregli-msft Aug 12, 2024
0d0c88c
Updates
gregli-msft Aug 13, 2024
e18f2fe
Updates
gregli-msft Aug 13, 2024
a06654e
Updates
gregli-msft Aug 14, 2024
08d89e0
Updates
gregli-msft Aug 14, 2024
442bf73
updates
gregli-msft Aug 15, 2024
dfa1e15
Updates 2
gregli-msft Aug 15, 2024
9d7ba42
Updates 3
gregli-msft Aug 15, 2024
d51e047
Updates
gregli-msft Aug 16, 2024
e309223
Updates
gregli-msft Aug 22, 2024
ef24bec
Merge branch 'main' into gregli/regex-min
gregli-msft Aug 22, 2024
98ad8e4
Updates
gregli-msft Aug 22, 2024
c55a842
Updates
gregli-msft Aug 30, 2024
c5dfdb0
Merge branch 'main' into gregli/regex-min
gregli-msft Aug 30, 2024
e646a6f
Merge branch 'main' into gregli/regex-min
gregli-msft Aug 30, 2024
856bdfe
Updates
gregli-msft Aug 31, 2024
14d7fe5
Updates
gregli-msft Aug 31, 2024
78d1197
updates
gregli-msft Aug 31, 2024
59bed91
Merge branch 'main' into gregli/regex-min
gregli-msft Aug 31, 2024
3c6f50b
Updates
gregli-msft Aug 31, 2024
9a7e568
Updates
gregli-msft Aug 31, 2024
7bf1438
Updates
gregli-msft Aug 31, 2024
54aea5f
Updates
gregli-msft Sep 1, 2024
df24948
Updates
gregli-msft Sep 1, 2024
d506d23
Updates
gregli-msft Sep 3, 2024
47086b6
Updates
gregli-msft Sep 4, 2024
3904484
Updates
gregli-msft Sep 4, 2024
d7542de
Updates
gregli-msft Sep 4, 2024
273e37e
Updates
gregli-msft Sep 4, 2024
e674cf6
Updates
gregli-msft Sep 6, 2024
280beb2
updates
gregli-msft Sep 6, 2024
01d7098
Updates
gregli-msft Sep 8, 2024
3090dfe
Merge branch 'main' into gregli/regex-min
gregli-msft Sep 8, 2024
a6bb1ad
Updates
gregli-msft Sep 9, 2024
65cadd9
Merge branch 'main' into gregli/regex-min
gregli-msft Sep 20, 2024
d8774a1
Updates
gregli-msft Sep 20, 2024
14afc3b
Updates
gregli-msft Sep 20, 2024
fc1796b
Merge branch 'main' into gregli/regex-min
gregli-msft Sep 27, 2024
709dc9b
Merge branch 'main' into gregli/regex-min
gregli-msft Oct 18, 2024
5615760
updates
gregli-msft Oct 22, 2024
6b4499b
Updates
gregli-msft Oct 24, 2024
fa7edbe
updats
gregli-msft Oct 25, 2024
13ea69a
Updates
gregli-msft Oct 25, 2024
4a254e4
Updates
gregli-msft Oct 25, 2024
18dd7ef
Merge branch 'main' into gregli/regex-min
gregli-msft Oct 29, 2024
a7a9e43
Merge branch 'main' into gregli/regex-min
gregli-msft Nov 6, 2024
46f8221
Updates
gregli-msft Nov 6, 2024
c06bb98
Updates
gregli-msft Nov 7, 2024
4162e1f
update
gregli-msft Nov 7, 2024
03f5190
Updates
gregli-msft Nov 10, 2024
eae88e8
Updates
gregli-msft Nov 13, 2024
f9f9faa
Update
gregli-msft Nov 14, 2024
147ddc5
Update
gregli-msft Nov 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,12 @@ internal static class TexlStrings
public static ErrorResourceKey ErrInvalidRegExBadSingleQuoteNamedCapture = new ErrorResourceKey("ErrInvalidRegExBadSingleQuoteNamedCapture");
public static ErrorResourceKey ErrInvalidRegExBadEscape = new ErrorResourceKey("ErrInvalidRegExBadEscape");
public static ErrorResourceKey ErrInvalidRegExBadCharacterClassSubtraction = new ErrorResourceKey("ErrInvalidRegExBadCharacterClassSubtraction");
public static ErrorResourceKey ErrInvalidRegExBadConditional = new ErrorResourceKey("ErrInvalidRegExBadConditional");
public static ErrorResourceKey ErrInvalidRegExBadConditional = new ErrorResourceKey("ErrInvalidRegExBadConditional");
public static ErrorResourceKey ErrInvalidRegExBadBackRefUseNameInsteadOfNum = new ErrorResourceKey("ErrInvalidRegExBadBackRefUseNameInsteadOfNum");
public static ErrorResourceKey ErrInvalidRegExBadBackRefNumberForName = new ErrorResourceKey("ErrInvalidRegExBadBackRefNumberForName");
public static ErrorResourceKey ErrInvalidRegExBadNamedCaptureAlreadyExists = new ErrorResourceKey("ErrInvalidRegExBadNamedCaptureAlreadyExists");
public static ErrorResourceKey ErrInvalidRegExBadNamedCaptureName = new ErrorResourceKey("ErrInvalidRegExBadNamedCaptureName");
public static ErrorResourceKey ErrInvalidRegExBadCharacterClassLiteralSquareBracket = new ErrorResourceKey("ErrInvalidRegExBadCharacterClassLiteralSquareBracket");

public static ErrorResourceKey ErrVariableRegEx = new ErrorResourceKey("ErrVariableRegEx");
public static ErrorResourceKey InfoRegExCaptureNameHidesPredefinedFullMatchField = new ErrorResourceKey("InfoRegExCaptureNameHidesPredefinedFullMatchField");
Expand Down
128 changes: 83 additions & 45 deletions src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/Match.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,35 +89,38 @@ public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DTyp

string regularExpression = nodeValue;
return fValid &&
(!context.Features.PowerFxV1CompatibilityRules || LimitRegularExpression(regExNode, regularExpression, errors)) &&
(!context.Features.PowerFxV1CompatibilityRules || IsSupportedRegularExpression(regExNode, regularExpression, errors)) &&
TryCreateReturnType(regExNode, regularExpression, errors, ref returnType);
}

// Limit regular expressions to common features that are supported, with conssitent semantics, by both canonical .NET and XRegExp.
// Limit regular expressions to common features that are supported, with consistent semantics, by both canonical .NET and XRegExp.
// It is better to disallow now and bring back with customer demand or as platforms add more support.
//
// Features that are disallowed:
// Capture groups
// Self-referncing groups, such as "(a\1)".
// Treat all escaped number sequences as a backreference number.
// Single quoted "(?'name'..." and "\k'name'".
// Balancing capture groups.
// Octal character codes (use Hex or Unicode instead).
// Self-referencing groups, such as "(a\1)" (.NET different from XRegExp).
// Treat all escaped number sequences as a backreference number (.NET different from XRegExp).
// Single quoted "(?'name'..." and "\k'name'" (.NET only).
// Balancing capture groups (.NET only).
// Using named captures with back reference \number (.NET different from XRegExp).
// Using \k<number> notation for numeric back references (.NET different from XRegExp).
// Octal character codes (.NET different from XRegExp)
// Use Hex or Unicode instead.
// "\o" could be added in the future, but we should avoid "\0" which causes backreference confusion.
// Inline options
// Anywhere in the expression except the beginning.
// For subexpressions.
// Anywhere in the expression except the beginning (.NET only).
// For subexpressions (.NET only).
// Character classes
// Character class subtraction "[a-z-[m-n]]".
// Conditional alternation
// Character class subtraction "[a-z-[m-n]]" (.NET only).
// Conditional alternation (.NET only).
//
// Features that aren't supported by canonical .NET will be blocked automatically when the regular expression is instantiated in TryCreateReturnType.
//
// We chose to use canonical .NET instead of RegexOptions.ECMAScript because we wanted the unicode definitions for words.
// See https://learn.microsoft.com/dotnet/standard/base-types/regular-expression-options#ecmascript-matching-behavior for more details
private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IErrorContainer errors)
private bool IsSupportedRegularExpression(TexlNode regExNode, string regexPattern, IErrorContainer errors)
{
// Scans the regular expression for interesting constructs, ignoring other elements and constructs that are leagl, such as letters and numbers.
// Scans the regular expression for interesting constructs, ignoring other elements and constructs that are legal, such as letters and numbers.
// Order of alternation is important. .NET regular expressions are greedy and will match the first of these that it can.
// Many subexpressions here take advantage of this, matching something that is valid, before falling through to check for something that is invalid.
//
Expand All @@ -128,38 +131,42 @@ private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IEr
// to gather all the matches in a linear scan from the beginning to the end.
var groupPunctuationRE = new Regex(
@"
# leading backslash
\\(?<goodBackRefNum>[1-9]\d*) | # numeric backreference
\\k<(?<goodBackRefName>\w+)> | # named backreference
(?<badOctal>\\0[0-7]{0,3}) | # octal are not accepted (no XRegExp support, by design)
# leading backslash, escape sequences
\\(?<goodBackRefNum>[1-9]\d*) | # numeric backreference
\\k<(?<goodBackRefName>\w+)> | # named backreference
(?<badOctal>\\0[0-7]{0,3}) | # octal are not accepted (no XRegExp support, by design)
(?<goodEscapeAlpha>\\
([bBdDfnrsStvwW] | # standard regex character classes, missing from .NET are aAeGzZ (no XRegExp support), other common are u{} and o
[pP]\{\w+\} | # unicode character classes
c[a-zA-Z] | # Ctrl character classes
x[0-9a-fA-F]{2} | # hex character, must be exactly 2 hex digits
u[0-9a-fA-F]{4})) | # Unicode characters, must be exactly 4 hex digits
(?<badEscapeAlpha>\\[a-zA-Z_]) | # reserving all other letters and underscore for future use (consistent with .NET)
(?<goodEscape>\\.) | # any other escaped character is allowed, but must be paired so that '\\(' is seen as '\\' followed by '(' and not '\' folloed by '\('
([bBdDfnrsStvwW] | # standard regex character classes, missing from .NET are aAeGzZ (no XRegExp support), other common are u{} and o
[pP]\{\w+\} | # unicode character classes
c[a-zA-Z] | # Ctrl character classes
x[0-9a-fA-F]{2} | # hex character, must be exactly 2 hex digits
u[0-9a-fA-F]{4})) | # Unicode characters, must be exactly 4 hex digits
(?<badEscapeAlpha>\\[a-zA-Z_]) | # reserving all other letters and underscore for future use (consistent with .NET)
(?<goodEscape>\\.) | # any other escaped character is allowed, but must be paired so that '\\(' is seen as '\\' followed by '(' and not '\' folloed by '\('

# leading (?<, named captures
\(\?<(?<goodNamedCapture>[a-zA-Z][a-zA-Z\d]*)> | # named capture group, can only be letters and numbers and must start with a letter
(?<badBalancing>\(\?<\w*-\w*>) | # .NET balancing captures are not supported
(?<badNamedCaptureName>\(\?<[^>]*>) | # bad named capture name, didn't match goodNamedCapture
(?<badSingleQuoteNamedCapture>\(\?'[^']*') | # single quoted capture names are not supported

# leading (?
\(\?<(?<goodNamedCapture>\w+)> | # named capture group
(?<goodNonCapture>\(\?:) | # non-capture group, still need to track to match with closing
(?<goodOptions>^\(\?[im]+\)) | # inline front of expression options we do support
(?<badOptions>\(\?(\w*-\w+|\w+)(:|\))?) | # inline options that we don't support, including disable of options (last ? portion makes for a better error message)
(?<badBalancing>\(\?(<|')\w*-\w+(>|')?) | # .NET balancing captures are not supported (last ? portion makes for a better error message)
(?<badSingleQuoteNamedCapture>\(\?'\w+'?) | # single quoted capture names are not supported (last ? portion makes for a better error message)
(?<badConditional>\(\?\() | # .NET conditional alternations are not supported
(?<goodNonCapture>\(\?:) | # non-capture group, still need to track to match with closing
(?<goodOptions>^\(\?[im]+\)) | # inline front of expression options we do support
(?<badOptions>\(\?(\w*-\w*|\w+)(:|\))?) | # inline options that we don't support, including disable of options (last ? portion makes for a better error message)
(?<badConditional>\(\?\() | # .NET conditional alternations are not supported

# basic open and close
(?<openCapture>\() |
(?<closeCapture>\)) |
(?<openCharacterClass>\[) |
(?<badCharacterClassEmpty>\[\]) | # disallow empty chararcter class (supported by XRegExp) and literal ] at front of character class (supported by .NET)
(?<openCapture>\() |
(?<closeCapture>\)) |
(?<openCharacterClass>\[) |
(?<closeCharacterClass>\])
", RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);

var groupCounter = 0; // last group number defined
var groupNumStack = new Stack<int>(); // stack of group numbers, -1 is used for non capturing groups
var groupNameDict = new Dictionary<string, int>(); // mapping from group names to group numbers, membership means the name was defined
var groupNumStack = new Stack<int>(); // stack of open group numbers, -1 is used for non capturing groups
var groupNameDict = new Dictionary<string, int>(); // mapping from group names to group numbers

var openCharacterClass = false; // are we defining a character class?

Expand All @@ -180,8 +187,11 @@ private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IEr
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadCharacterClassSubtraction);
return false;
}

// else ok, "[a[b]" is supported
else
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadCharacterClassLiteralSquareBracket);
return false;
}
}
else
{
Expand All @@ -190,11 +200,7 @@ private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IEr
}
else if (groupMatch.Groups["closeCharacterClass"].Success)
{
// supports "[]]" which is valid but the closing square bracket must immediately follow the open
if (openCharacterClass && regexPattern[groupMatch.Groups["closeCharacterClass"].Index - 1] != '[')
{
openCharacterClass = false;
}
openCharacterClass = false;
}
else if (groupMatch.Groups["openCapture"].Success || groupMatch.Groups["goodNonCapture"].Success || groupMatch.Groups["goodNamedCapture"].Success)
{
Expand All @@ -205,6 +211,12 @@ private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IEr
groupNumStack.Push(groupMatch.Groups["goodNonCapture"].Success ? -1 : ++groupCounter);
if (groupMatch.Groups["goodNamedCapture"].Success)
{
if (groupNameDict.ContainsKey(groupMatch.Groups["goodNamedCapture"].Value))
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadNamedCaptureAlreadyExists, groupMatch.Value);
return false;
}

groupNameDict.Add(groupMatch.Groups["goodNamedCapture"].Value, groupCounter);
}
}
Expand All @@ -228,6 +240,13 @@ private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IEr
return false;
}

// group has a name, use that instead
if (groupNameDict.ContainsValue(backRefNum))
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBackRefUseNameInsteadOfNum, groupMatch.Value);
return false;
}

// group is not closed and thus self referencing
if (groupNumStack.Contains(backRefNum))
{
Expand All @@ -242,8 +261,16 @@ private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IEr
// group isn't defined, or not defined yet
if (!groupNameDict.TryGetValue(backRefName, out var groupNum))
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBackRefNotDefined, groupMatch.Value);
return false;
if (int.TryParse(backRefName, out groupNum))
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBackRefNumberForName, groupMatch.Value);
return false;
}
else
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadBackRefNotDefined, groupMatch.Value);
return false;
}
}

// group is not closed and thus self referencing
Expand All @@ -253,6 +280,11 @@ private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IEr
return false;
}
}
else if (groupMatch.Groups["badNamedCaptureName"].Success)
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadNamedCaptureName, groupMatch.Groups["badNamedCaptureName"].Value);
return false;
}
else if (groupMatch.Groups["badOctal"].Success)
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadOctal, groupMatch.Groups["badOctal"].Value);
Expand Down Expand Up @@ -286,13 +318,19 @@ private bool LimitRegularExpression(TexlNode regExNode, string regexPattern, IEr
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadConditional, groupMatch.Groups["badConditional"].Value);
return false;
}
else if (groupMatch.Groups["badCharacterClassEmpty"].Success)
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadCharacterClassLiteralSquareBracket, groupMatch.Groups["badCharacterClassEmpty"].Value);
return false;
}
else if (groupMatch.Groups["badEscapeAlpha"].Success)
{
errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExBadEscape, groupMatch.Groups["badEscapeAlpha"].Value);
return false;
}
else
{
// This should never be hit. Good to have here in case one of the group names checked doesn't match the RE, running tests would hit this.
throw new NotImplementedException("Unknown regular expression match");
gregli-msft marked this conversation as resolved.
Show resolved Hide resolved
}
}
Expand Down
Loading
Loading