Skip to content

Commit a9ec512

Browse files
authored
Fix a few regex class parsing use cases
1 parent d3bc3e3 commit a9ec512

File tree

3 files changed

+136
-9
lines changed

3 files changed

+136
-9
lines changed

resources/RegexGrammar.pp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,16 @@
4242
//
4343

4444
// Character classes.
45+
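// Tokens suffixed with "fc_" match the same text as their unsuffixed counterparts, but only when immediately followed by "]"; that "]" is then lexed as "class_fc:_class" (a literal "]") before the lexer switches to the "class" namespace.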
46+
%token negative_class_fc_ \[\^(?=\]) -> class_fc
47+
%token class_fc_ \[(?=\]) -> class_fc
48+
%token class_fc:_class \] -> class
4549
%token negative_class_ \[\^ -> class
4650
%token class_ \[ -> class
4751
%token class:posix_class \[:\^?[a-z]+:\]
4852
%token class:class_ \[
49-
%token class:_class_literal (?<=[^\\]\[|[^\\]\[\^)\]
5053
%token class:_class \] -> default
5154
%token class:range \-
52-
%token class:escaped_end_class \\\]
5355
// taken over from literals but class:character has \b support on top (backspace in character classes)
5456
%token class:character \\([aefnrtb]|c[\x00-\x7f])
5557
%token class:dynamic_character \\([0-7]{3}|x[0-9a-zA-Z]{2}|x{[0-9a-zA-Z]+})
@@ -58,7 +60,8 @@
5860

5961
// Internal options.
6062
// See https://www.regular-expressions.info/refmodifiers.html
61-
%token internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx)\)
63+
// and https://www.php.net/manual/en/regexp.reference.internal-options.php
64+
%token internal_option \(\?[imsxnJUX^]*-?[imsxnJUX^]+\)
6265

6366
// Lookahead and lookbehind assertions.
6467
%token lookahead_ \(\?=
@@ -88,7 +91,7 @@
8891
%token nc:_named_capturing > -> default
8992
%token nc:capturing_name .+?(?=(?<!\\)>)
9093
%token non_capturing_ \(\?:
91-
%token non_capturing_internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx):
94+
%token non_capturing_internal_option \(\?[imsxnJUX^]*-?[imsxnJUX^]+:
9295
%token non_capturing_reset_ \(\?\|
9396
%token atomic_group_ \(\?>
9497
%token capturing_ \(
@@ -177,10 +180,14 @@
177180

178181
#class:
179182
(
180-
::negative_class_:: #negativeclass
183+
::negative_class_fc_:: #negativeclass
184+
<_class>
185+
| ::class_fc_::
186+
<_class>
187+
| ::negative_class_:: #negativeclass
181188
| ::class_::
182189
)
183-
( <range> | <_class_literal> )? ( <posix_class> | <class_> | range() | literal() | <escaped_end_class> )* <range>?
190+
<range>? ( <posix_class> | <class_> | range() <range>? | literal() )* <range>?
184191
::_class::
185192

186193
#range:

src/Type/Regex/RegexGroupParser.php

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -525,11 +525,11 @@ private function getLiteralValue(TreeNode $node, ?array &$onlyLiterals, bool $ap
525525

526526
if (
527527
in_array($token, [
528-
'literal', 'escaped_end_class',
528+
'literal',
529529
// literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', or directly after a range like '[0-1-y]', not forming a range
530530
'range',
531531
// literal "[" or "]" inside character classes '[[]' or '[]]'
532-
'class_', '_class_literal',
532+
'class_', '_class',
533533
], true)
534534
) {
535535
if (str_contains($patternModifiers, 'x') && trim($value) === '') {
@@ -544,7 +544,6 @@ private function getLiteralValue(TreeNode $node, ?array &$onlyLiterals, bool $ap
544544

545545
if (
546546
$appendLiterals
547-
&& in_array($token, ['literal', 'range', 'class_', '_class_literal'], true)
548547
&& $onlyLiterals !== null
549548
&& (!in_array($value, ['.'], true) || $isEscaped || $inCharacterClass)
550549
) {

tests/PHPStan/Analyser/nsrt/preg_match_shapes.php

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,9 @@ function bug11323(string $s): void {
467467
if (preg_match('{([-\p{L}[\]*|\x03\a\b+?{}(?:)-]+[^[:digit:]?{}a-z0-9#-k]+)(a-z)}', $s, $matches)) {
468468
assertType("array{string, non-falsy-string, 'a-z'}", $matches);
469469
}
470+
if (preg_match('{(\d+)(?i)insensitive((?xs-i)case SENSITIVE here.+and dot matches new lines)}', $s, $matches)) {
471+
assertType('array{string, numeric-string, non-falsy-string}', $matches);
472+
}
470473
if (preg_match('{(\d+)(?i)insensitive((?x-i)case SENSITIVE here(?i:insensitive non-capturing group))}', $s, $matches)) {
471474
assertType('array{string, numeric-string, non-falsy-string}', $matches);
472475
}
@@ -778,3 +781,121 @@ function testLtrimDelimiter (string $string): void {
778781
assertType("array{string, 'x'}", $matches);
779782
}
780783
}
784+
785+
function testUnescapeBackslash (string $string): void {
786+
if (preg_match(<<<'EOD'
787+
~(\[)~
788+
EOD, $string, $matches)) {
789+
assertType("array{string, '['}", $matches);
790+
}
791+
792+
if (preg_match(<<<'EOD'
793+
~(\d)~
794+
EOD, $string, $matches)) {
795+
assertType("array{string, numeric-string}", $matches);
796+
}
797+
798+
if (preg_match(<<<'EOD'
799+
~(\\d)~
800+
EOD, $string, $matches)) {
801+
assertType("array{string, '\\\d'}", $matches);
802+
}
803+
804+
if (preg_match(<<<'EOD'
805+
~(\\\d)~
806+
EOD, $string, $matches)) {
807+
assertType("array{string, non-falsy-string}", $matches);
808+
}
809+
810+
if (preg_match(<<<'EOD'
811+
~(\\\\d)~
812+
EOD, $string, $matches)) {
813+
assertType("array{string, '\\\\\\\d'}", $matches);
814+
}
815+
}
816+
817+
function testEscapedDelimiter (string $string): void {
818+
if (preg_match(<<<'EOD'
819+
/(\/)/
820+
EOD, $string, $matches)) {
821+
assertType("array{string, '/'}", $matches);
822+
}
823+
824+
if (preg_match(<<<'EOD'
825+
~(\~)~
826+
EOD, $string, $matches)) {
827+
assertType("array{string, '~'}", $matches);
828+
}
829+
830+
if (preg_match(<<<'EOD'
831+
~(\[2])~
832+
EOD, $string, $matches)) {
833+
assertType("array{string, '[2]'}", $matches);
834+
}
835+
836+
if (preg_match(<<<'EOD'
837+
[(\[2\])]
838+
EOD, $string, $matches)) {
839+
assertType("array{string, '[2]'}", $matches);
840+
}
841+
842+
if (preg_match(<<<'EOD'
843+
~(\{2})~
844+
EOD, $string, $matches)) {
845+
assertType("array{string, '{2}'}", $matches);
846+
}
847+
848+
if (preg_match(<<<'EOD'
849+
{(\{2\})}
850+
EOD, $string, $matches)) {
851+
assertType("array{string, '{2}'}", $matches);
852+
}
853+
854+
if (preg_match(<<<'EOD'
855+
~([a\]])~
856+
EOD, $string, $matches)) {
857+
assertType("array{string, ']'|'a'}", $matches);
858+
}
859+
860+
if (preg_match(<<<'EOD'
861+
~([a[])~
862+
EOD, $string, $matches)) {
863+
assertType("array{string, '['|'a'}", $matches);
864+
}
865+
866+
if (preg_match(<<<'EOD'
867+
~([a\]b])~
868+
EOD, $string, $matches)) {
869+
assertType("array{string, ']'|'a'|'b'}", $matches);
870+
}
871+
872+
if (preg_match(<<<'EOD'
873+
~([a[b])~
874+
EOD, $string, $matches)) {
875+
assertType("array{string, '['|'a'|'b'}", $matches);
876+
}
877+
878+
if (preg_match(<<<'EOD'
879+
~([a\[b])~
880+
EOD, $string, $matches)) {
881+
assertType("array{string, '['|'a'|'b'}", $matches);
882+
}
883+
884+
if (preg_match(<<<'EOD'
885+
[([a\[b])]
886+
EOD, $string, $matches)) {
887+
assertType("array{string, '['|'a'|'b'}", $matches);
888+
}
889+
890+
if (preg_match(<<<'EOD'
891+
{(x\\\{)|(y\\\\\})}
892+
EOD, $string, $matches)) {
893+
assertType("array{string, '', 'y\\\\\\\}'}|array{string, 'x\\\{'}", $matches);
894+
}
895+
}
896+
897+
function bugUnescapedDashAfterRange (string $string): void {
898+
if (preg_match('/([0-1-y])/', $string, $matches)) {
899+
assertType("array{string, non-empty-string}", $matches);
900+
}
901+
}

0 commit comments

Comments
 (0)