From 736c567e257dbe0fcf6ce81b4d6dbe05c6899f96 Mon Sep 17 00:00:00 2001 From: David Grudl Date: Thu, 2 May 2024 13:10:58 +0200 Subject: [PATCH] Strings::matchAll(): added option 'lazy' --- src/Utils/Strings.php | 32 +++++++-- tests/Utils/Strings.match().phpt | 5 +- tests/Utils/Strings.matchAll().phpt | 10 ++- tests/Utils/Strings.matchAll.lazy().phpt | 89 ++++++++++++++++++++++++ 4 files changed, 127 insertions(+), 9 deletions(-) create mode 100644 tests/Utils/Strings.matchAll.lazy().phpt diff --git a/src/Utils/Strings.php b/src/Utils/Strings.php index 1615b3e86..c07356595 100644 --- a/src/Utils/Strings.php +++ b/src/Utils/Strings.php @@ -589,6 +589,7 @@ public static function match( /** * Searches the string for all occurrences matching the regular expression and * returns an array of arrays containing the found expression and each subexpression. + * @return ($lazy is true ? \Generator : array[]) */ public static function matchAll( string $subject, @@ -599,21 +600,41 @@ public static function matchAll( bool $unmatchedAsNull = false, bool $patternOrder = false, bool $utf8 = false, - ): array + bool $lazy = false, + ): array|\Generator { - $flags = is_int($captureOffset) // back compatibility - ? $captureOffset - : ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0); - if ($utf8) { $offset = strlen(self::substring($subject, 0, $offset)); $pattern .= 'u'; } + if ($lazy) { + $flags = PREG_OFFSET_CAPTURE | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0); + return (function () use ($utf8, $captureOffset, $flags, $subject, $pattern, $offset) { + $counter = 0; + while ( + $offset <= strlen($subject) - ($counter ? 1 : 0) + && self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset]) + ) { + $offset = $m[0][1] + max(1, strlen($m[0][0])); + if (!$captureOffset) { + $m = array_map(fn($item) => $item[0], $m); + } elseif ($utf8) { + $m = self::bytesToChars($subject, [$m])[0]; + } + yield $counter++ => $m; + } + })(); + } + if ($offset > strlen($subject)) { return []; } + $flags = is_int($captureOffset) // back compatibility + ? $captureOffset + : ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0); + self::pcre('preg_match_all', [ $pattern, $subject, &$m, ($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER), @@ -622,7 +643,6 @@ public static function matchAll( return $utf8 && $captureOffset ? self::bytesToChars($subject, $m) : $m; - } diff --git a/tests/Utils/Strings.match().phpt b/tests/Utils/Strings.match().phpt index 26b9168ee..b3c9683cb 100644 --- a/tests/Utils/Strings.match().phpt +++ b/tests/Utils/Strings.match().phpt @@ -41,5 +41,6 @@ Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', capt // right edge -Assert::null(Strings::match('hello world!', '', offset: 50)); -Assert::null(Strings::match('', '', offset: 1)); +Assert::same([''], Strings::match('he', '#(?<=e)#', offset: 2)); +Assert::same(null, Strings::match('he', '#(?<=x)#', offset: 2)); +Assert::same(null, Strings::match('he', '##', offset: 3)); diff --git a/tests/Utils/Strings.matchAll().phpt b/tests/Utils/Strings.matchAll().phpt index 844f16f27..8e73284b9 100644 --- a/tests/Utils/Strings.matchAll().phpt +++ b/tests/Utils/Strings.matchAll().phpt @@ -16,6 +16,12 @@ require __DIR__ . '/../bootstrap.php'; Assert::same([], Strings::matchAll('hello world!', '#([E-L])+#')); +// sentinel +Assert::same([ + [''], [''], [''], +], Strings::matchAll('he', '##')); + + // capturing Assert::same([ ['hell', 'l'], @@ -81,4 +87,6 @@ Assert::same([['e', null]], Strings::matchAll('hello world!', '#e(x)*#', unmatch // right edge -Assert::same([], Strings::matchAll('hello world!', '', offset: 50)); +Assert::same([['']], Strings::matchAll('he', '#(?<=e)#', offset: 2)); +Assert::same([], Strings::matchAll('he', '#(?<=x)#', offset: 2)); +Assert::same([], Strings::matchAll('he', '##', offset: 3)); diff --git a/tests/Utils/Strings.matchAll.lazy().phpt b/tests/Utils/Strings.matchAll.lazy().phpt new file mode 100644 index 000000000..753465b9e --- /dev/null +++ b/tests/Utils/Strings.matchAll.lazy().phpt @@ -0,0 +1,89 @@ +