Skip to content

Commit 522a8b0

Browse files
staabmondrejmirtes
authored andcommitted
Fix regex parsing by completing the grammar
- Skip regex parsing if the pattern is not valid, but report errors then if parsing fails - Fix support for internal options to include more options, and fix []] (i.e. class end delimiter as first character is not a literal) not parsing correctly - Add a couple missing quantifiers in getQuantificationRange - Handle more literal values and fix escaped ones to be unescaped
1 parent 831f4cd commit 522a8b0

File tree

5 files changed

+177
-48
lines changed

5 files changed

+177
-48
lines changed

resources/RegexGrammar.pp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,23 @@
4646
%skip nl \n
4747

4848
// Character classes.
49-
%token negative_class_ \[\^
50-
%token class_ \[
51-
%token _class \]
52-
%token range \-
49+
%token negative_class_ \[\^ -> class
50+
%token class_ \[ -> class
51+
%token class:posix_class \[:\^?[a-z]+:\]
52+
%token class:class_ \[
53+
%token class:_class_literal (?<=[^\\]\[|[^\\]\[\^)\]
54+
%token class:_class \] -> default
55+
%token class:range \-
56+
%token class:escaped_end_class \\\]
57+
// taken over from literals but class:character has \b support on top (backspace in character classes)
58+
%token class:character \\([aefnrtb]|c[\x00-\x7f])
59+
%token class:dynamic_character \\([0-7]{3}|x[0-9a-zA-Z]{2}|x{[0-9a-zA-Z]+})
60+
%token class:character_type \\([CdDhHNRsSvVwWX]|[pP]{[^}]+})
61+
%token class:literal \\.|.
5362

5463
// Internal options.
55-
%token internal_option \(\?[\-+]?[imsx]\)
64+
// See https://www.regular-expressions.info/refmodifiers.html
65+
%token internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx)\)
5666

5767
// Lookahead and lookbehind assertions.
5868
%token lookahead_ \(\?=
@@ -77,6 +87,7 @@
7787
%token nc:_named_capturing > -> default
7888
%token nc:capturing_name .+?(?=(?<!\\)>)
7989
%token non_capturing_ \(\?:
90+
%token non_capturing_internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx):
8091
%token non_capturing_reset_ \(\?\|
8192
%token atomic_group_ \(\?>
8293
%token capturing_ \(
@@ -168,7 +179,7 @@
168179
::negative_class_:: #negativeclass
169180
| ::class_::
170181
)
171-
( <class_> | range() | literal() )+
182+
( <range> | <_class_literal> )? ( <posix_class> | <class_> | range() | literal() | <escaped_end_class> )* <range>?
172183
::_class::
173184

174185
#range:
@@ -183,15 +194,18 @@
183194
| (
184195
::named_capturing_:: <capturing_name> ::_named_capturing:: #namedcapturing
185196
| ::non_capturing_:: #noncapturing
197+
| non_capturing_internal_options() #noncapturing
186198
| ::non_capturing_reset_:: #noncapturingreset
187199
| ::atomic_group_:: #atomicgroup
188200
| ::capturing_::
189201
)
190202
alternation() ::_capturing::
191203

204+
non_capturing_internal_options:
205+
<non_capturing_internal_option>
206+
192207
literal:
193208
<character>
194-
| <range>
195209
| <dynamic_character>
196210
| <character_type>
197211
| <anchor>

src/Type/Php/RegexArrayShapeMatcher.php

Lines changed: 89 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
use Hoa\Compiler\Llk\TreeNode;
88
use Hoa\Exception\Exception;
99
use Hoa\File\Read;
10+
use Nette\Utils\RegexpException;
1011
use Nette\Utils\Strings;
1112
use PhpParser\Node\Expr;
1213
use PhpParser\Node\Name;
@@ -31,7 +32,11 @@
3132
use function in_array;
3233
use function is_int;
3334
use function is_string;
35+
use function rtrim;
3436
use function sscanf;
37+
use function str_replace;
38+
use function strlen;
39+
use function substr;
3540
use const PREG_OFFSET_CAPTURE;
3641
use const PREG_UNMATCHED_AS_NULL;
3742

@@ -375,6 +380,13 @@ private function parseGroups(string $regex): ?array
375380
self::$parser = Llk::load(new Read(__DIR__ . '/../../../resources/RegexGrammar.pp'));
376381
}
377382

383+
try {
384+
Strings::match('', $regex);
385+
} catch (RegexpException) {
386+
// pattern is invalid, so let the RegularExpressionPatternRule report it
387+
return null;
388+
}
389+
378390
try {
379391
$ast = self::$parser->parse($regex);
380392
} catch (Exception) {
@@ -516,25 +528,37 @@ private function getQuantificationRange(TreeNode $node): array
516528
$lastChild = $node->getChild($node->getChildrenNumber() - 1);
517529
$value = $lastChild->getValue();
518530

519-
if ($value['token'] === 'n_to_m') {
520-
if (sscanf($value['value'], '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) {
531+
// normalize away possessive and lazy quantifier-modifiers
532+
$token = str_replace(['_possessive', '_lazy'], '', $value['token']);
533+
$value = rtrim($value['value'], '+?');
534+
535+
if ($token === 'n_to_m') {
536+
if (sscanf($value, '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) {
521537
throw new ShouldNotHappenException();
522538
}
523539

524540
$min = $n;
525541
$max = $m;
526-
} elseif ($value['token'] === 'exactly_n') {
527-
if (sscanf($value['value'], '{%d}', $n) !== 1 || !is_int($n)) {
542+
} elseif ($token === 'n_or_more') {
543+
if (sscanf($value, '{%d,}', $n) !== 1 || !is_int($n)) {
544+
throw new ShouldNotHappenException();
545+
}
546+
547+
$min = $n;
548+
} elseif ($token === 'exactly_n') {
549+
if (sscanf($value, '{%d}', $n) !== 1 || !is_int($n)) {
528550
throw new ShouldNotHappenException();
529551
}
530552

531553
$min = $n;
532554
$max = $n;
533-
} elseif ($value['token'] === 'zero_or_one') {
555+
} elseif ($token === 'zero_or_one') {
534556
$min = 0;
535557
$max = 1;
536-
} elseif ($value['token'] === 'zero_or_more') {
558+
} elseif ($token === 'zero_or_more') {
537559
$min = 0;
560+
} elseif ($token === 'one_or_more') {
561+
$min = 1;
538562
}
539563

540564
return [$min, $max];
@@ -591,20 +615,8 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
591615
if ($literalValue !== null) {
592616
if (Strings::match($literalValue, '/^\d+$/') === null) {
593617
$isNumeric = TrinaryLogic::createNo();
594-
}
595-
596-
if (!$inOptionalQuantification) {
597-
$isNonEmpty = TrinaryLogic::createYes();
598-
}
599-
}
600-
601-
if ($ast->getValueToken() === 'character_type') {
602-
if ($ast->getValueValue() === '\d') {
603-
if ($isNumeric->maybe()) {
604-
$isNumeric = TrinaryLogic::createYes();
605-
}
606-
} else {
607-
$isNumeric = TrinaryLogic::createNo();
618+
} elseif ($isNumeric->maybe()) {
619+
$isNumeric = TrinaryLogic::createYes();
608620
}
609621

610622
if (!$inOptionalQuantification) {
@@ -624,7 +636,6 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
624636
}
625637

626638
if (Strings::match($literalValue, '/^\d+$/') === null) {
627-
$allNumeric = false;
628639
break;
629640
}
630641

@@ -653,13 +664,65 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
653664

654665
private function getLiteralValue(TreeNode $node): ?string
655666
{
656-
if ($node->getId() === 'token' && $node->getValueToken() === 'literal') {
657-
return $node->getValueValue();
667+
if ($node->getId() !== 'token') {
668+
return null;
669+
}
670+
671+
// token is the token name from grammar without the namespace so literal and class:literal are both called literal here
672+
$token = $node->getValueToken();
673+
$value = $node->getValueValue();
674+
675+
if (in_array($token, ['literal', 'escaped_end_class'], true)) {
676+
if (strlen($node->getValueValue()) > 1 && $value[0] === '\\') {
677+
return substr($value, 1);
678+
}
679+
680+
return $value;
681+
}
682+
683+
// literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', not forming a range
684+
if ($token === 'range') {
685+
return $value;
686+
}
687+
688+
// literal "[" or "]" inside character classes '[[]' or '[]]'
689+
if (in_array($token, ['class_', '_class_literal'], true)) {
690+
return $value;
691+
}
692+
693+
// character escape sequences, just return a fixed string
694+
if (in_array($token, ['character', 'dynamic_character', 'character_type'], true)) {
695+
if ($token === 'character_type' && $value === '\d') {
696+
return '0';
697+
}
698+
699+
return $value;
700+
}
701+
702+
// [:digit:] and the like, more support coming later
703+
if ($token === 'posix_class') {
704+
if ($value === '[:digit:]') {
705+
return '0';
706+
}
707+
if (in_array($value, ['[:alpha:]', '[:alnum:]', '[:upper:]', '[:lower:]', '[:word:]', '[:ascii:]', '[:print:]', '[:xdigit:]', '[:graph:]'], true)) {
708+
return 'a';
709+
}
710+
if ($value === '[:blank:]') {
711+
return " \t";
712+
}
713+
if ($value === '[:cntrl:]') {
714+
return "\x00\x1F";
715+
}
716+
if ($value === '[:space:]') {
717+
return " \t\r\n\v\f";
718+
}
719+
if ($value === '[:punct:]') {
720+
return '!"#$%&\'()*+,\-./:;<=>?@[\]^_`{|}~';
721+
}
658722
}
659723

660-
// literal "-" outside of a character class like '~^((\\d{1,6})-)$~'
661-
if ($node->getId() === 'token' && $node->getValueToken() === 'range') {
662-
return $node->getValueValue();
724+
if ($token === 'anchor' || $token === 'match_point_reset') {
725+
return '';
663726
}
664727

665728
return null;

tests/PHPStan/Analyser/LegacyNodeScopeResolverTest.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8003,7 +8003,7 @@ public function dataPassedByReference(): array
80038003
'$arr',
80048004
],
80058005
[
8006-
'array{0?: string}',
8006+
'array<string>',
80078007
'$matches',
80088008
],
80098009
[

tests/PHPStan/Analyser/nsrt/preg_match_shapes.php

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -127,28 +127,18 @@ function doUnknownFlags(string $s, int $flags): void {
127127
assertType('array<array{string|null, int<-1, max>}|string|null>', $matches);
128128
}
129129

130-
function doNonAutoCapturingModifier(string $s): void {
131-
if (preg_match('/(?n)(\d+)/', $s, $matches)) {
132-
// could be assertType('array{string}', $matches);
133-
assertType('array<string>', $matches);
134-
}
135-
assertType('array<string>', $matches);
136-
}
137-
138130
function doMultipleAlternativeCaptureGroupsWithSameNameWithModifier(string $s): void {
139131
if (preg_match('/(?J)(?<Foo>[a-z]+)|(?<Foo>[0-9]+)/', $s, $matches)) {
140-
// could be assertType('array{0: string, Foo: string, 1: string}', $matches);
141-
assertType('array<string>', $matches);
132+
assertType('array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
142133
}
143-
assertType('array<string>', $matches);
134+
assertType('array{}|array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
144135
}
145136

146137
function doMultipleConsecutiveCaptureGroupsWithSameNameWithModifier(string $s): void {
147138
if (preg_match('/(?J)(?<Foo>[a-z]+)|(?<Foo>[0-9]+)/', $s, $matches)) {
148-
// could be assertType('array{0: string, Foo: string, 1: string}', $matches);
149-
assertType('array<string>', $matches);
139+
assertType('array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
150140
}
151-
assertType('array<string>', $matches);
141+
assertType('array{}|array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
152142
}
153143

154144
// https://github.com/hoaproject/Regex/issues/31
@@ -472,3 +462,57 @@ function (string $s): void {
472462
assertType("array{string, non-empty-string}", $matches);
473463
}
474464
};
465+
466+
function bug11323(string $s): void {
467+
if (preg_match('/([*|+?{}()]+)([^*|+[:digit:]?{}()]+)/', $s, $matches)) {
468+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
469+
}
470+
if (preg_match('/\p{L}[[\]]+([-*|+?{}(?:)]+)([^*|+[:digit:]?{a-z}(\p{L})\a-]+)/', $s, $matches)) {
471+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
472+
}
473+
if (preg_match('{([-\p{L}[\]*|\x03\a\b+?{}(?:)-]+[^[:digit:]?{}a-z0-9#-k]+)(a-z)}', $s, $matches)) {
474+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
475+
}
476+
if (preg_match('{(\d+)(?i)insensitive((?x-i)case SENSITIVE here(?i:insensitive non-capturing group))}', $s, $matches)) {
477+
assertType('array{string, numeric-string, non-empty-string}', $matches);
478+
}
479+
if (preg_match('{([]] [^]])}', $s, $matches)) {
480+
assertType('array{string, non-empty-string}', $matches);
481+
}
482+
if (preg_match('{([[:digit:]])}', $s, $matches)) {
483+
assertType('array{string, numeric-string}', $matches);
484+
}
485+
if (preg_match('{([\d])(\d)}', $s, $matches)) {
486+
assertType('array{string, numeric-string, numeric-string}', $matches);
487+
}
488+
if (preg_match('{([0-9])}', $s, $matches)) {
489+
assertType('array{string, numeric-string}', $matches);
490+
}
491+
if (preg_match('{(\p{L})(\p{P})(\p{Po})}', $s, $matches)) {
492+
assertType('array{string, non-empty-string, non-empty-string, non-empty-string}', $matches);
493+
}
494+
if (preg_match('{(a)??(b)*+(c++)(d)+?}', $s, $matches)) {
495+
assertType('array{string, string, string, non-empty-string, non-empty-string}', $matches);
496+
}
497+
if (preg_match('{(.\d)}', $s, $matches)) {
498+
assertType('array{string, non-empty-string}', $matches);
499+
}
500+
if (preg_match('{(\d.)}', $s, $matches)) {
501+
assertType('array{string, non-empty-string}', $matches);
502+
}
503+
if (preg_match('{(\d\d)}', $s, $matches)) {
504+
assertType('array{string, numeric-string}', $matches);
505+
}
506+
if (preg_match('{(.(\d))}', $s, $matches)) {
507+
assertType('array{string, non-empty-string, numeric-string}', $matches);
508+
}
509+
if (preg_match('{((\d).)}', $s, $matches)) {
510+
assertType('array{string, non-empty-string, numeric-string}', $matches);
511+
}
512+
if (preg_match('{(\d([1-4])\d)}', $s, $matches)) {
513+
assertType('array{string, numeric-string, numeric-string}', $matches);
514+
}
515+
if (preg_match('{(x?([1-4])\d)}', $s, $matches)) {
516+
assertType('array{string, non-empty-string, numeric-string}', $matches);
517+
}
518+
}

tests/PHPStan/Analyser/nsrt/preg_match_shapes_php80.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,11 @@ function doOffsetCaptureWithUnmatchedNull(string $s): void {
1111
}
1212
assertType('array{}|array{array{string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}}', $matches);
1313
}
14+
15+
function doNonAutoCapturingModifier(string $s): void {
16+
if (preg_match('/(?n)(\d+)/', $s, $matches)) {
17+
// should be assertType('array{string}', $matches);
18+
assertType('array{string, numeric-string}', $matches);
19+
}
20+
assertType('array{}|array{string, numeric-string}', $matches);
21+
}

0 commit comments

Comments
 (0)