@@ -342,8 +342,8 @@ extension Source {
342
342
} . value
343
343
}
344
344
345
- /// Eat a scalar off the front, starting from after the
346
- /// backslash and base character (e.g. `\u` or `\x`).
345
+ /// Try to eat a scalar off the front, starting from after the backslash. The
346
+ /// base character (e.g. `u` or `x`) is eaten here; returns `nil` if it does not start a scalar.
347
347
///
348
348
/// UniScalar -> 'u{' UniScalarSequence '}'
349
349
/// | 'u' HexDigit{4}
@@ -353,60 +353,60 @@ extension Source {
353
353
/// | 'o{' OctalDigit{1...} '}'
354
354
/// | '0' OctalDigit{0...3}
355
355
///
356
- mutating func expectUnicodeScalar(
357
- escapedCharacter base: Character
358
- ) throws -> AST . Atom . Kind {
356
+ mutating func lexUnicodeScalar( ) throws -> AST . Atom . Kind ? {
359
357
try recordLoc { src in
358
+ try src. tryEating { src in
360
359
361
- func nullScalar( ) -> AST . Atom . Kind {
362
- let pos = src. currentPosition
363
- return . scalar( . init( UnicodeScalar ( 0 ) , SourceLocation ( pos ..< pos) ) )
364
- }
365
-
366
- // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
367
- switch base {
368
- // Hex numbers.
369
- case " u " where src. tryEat ( " { " ) :
370
- return try src. expectUnicodeScalarSequence ( eating: " } " )
371
-
372
- case " x " where src. tryEat ( " { " ) :
373
- let str = try src. lexUntil ( eating: " } " )
374
- return . scalar( try Source . validateUnicodeScalar ( str, . hex) )
375
-
376
- case " x " :
377
- // \x expects *up to* 2 digits.
378
- guard let digits = src. tryEatLocatedPrefix ( maxLength: 2 , \. isHexDigit)
379
- else {
380
- // In PCRE, \x without any valid hex digits is \u{0}.
381
- // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
382
- // could be changed to throw an error if we had a parsing mode for
383
- // them.
384
- return nullScalar ( )
360
+ func nullScalar( ) -> AST . Atom . Kind {
361
+ let pos = src. currentPosition
362
+ return . scalar( . init( UnicodeScalar ( 0 ) , SourceLocation ( pos ..< pos) ) )
385
363
}
386
- return . scalar( try Source . validateUnicodeScalar ( digits, . hex) )
387
364
388
- case " u " :
389
- return . scalar( try src. expectUnicodeScalar ( numDigits: 4 ) )
390
- case " U " :
391
- return . scalar( try src. expectUnicodeScalar ( numDigits: 8 ) )
365
+ // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
366
+ switch src. tryEat ( ) {
367
+ // Hex numbers.
368
+ case " u " where src. tryEat ( " { " ) :
369
+ return try src. expectUnicodeScalarSequence ( eating: " } " )
370
+
371
+ case " x " where src. tryEat ( " { " ) :
372
+ let str = try src. lexUntil ( eating: " } " )
373
+ return . scalar( try Source . validateUnicodeScalar ( str, . hex) )
374
+
375
+ case " x " :
376
+ // \x expects *up to* 2 digits.
377
+ guard let digits = src. tryEatLocatedPrefix ( maxLength: 2 , \. isHexDigit)
378
+ else {
379
+ // In PCRE, \x without any valid hex digits is \u{0}.
380
+ // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
381
+ // could be changed to throw an error if we had a parsing mode for
382
+ // them.
383
+ return nullScalar ( )
384
+ }
385
+ return . scalar( try Source . validateUnicodeScalar ( digits, . hex) )
386
+
387
+ case " u " :
388
+ return . scalar( try src. expectUnicodeScalar ( numDigits: 4 ) )
389
+ case " U " :
390
+ return . scalar( try src. expectUnicodeScalar ( numDigits: 8 ) )
391
+
392
+ // Octal numbers.
393
+ case " o " where src. tryEat ( " { " ) :
394
+ let str = try src. lexUntil ( eating: " } " )
395
+ return . scalar( try Source . validateUnicodeScalar ( str, . octal) )
396
+
397
+ case " 0 " :
398
+ // We can read *up to* 3 more octal digits.
399
+ // FIXME: PCRE can only read up to 2 octal digits; if we get a strict
400
+ // PCRE mode, we should limit it here.
401
+ guard let digits = src. tryEatLocatedPrefix ( maxLength: 3 , \. isOctalDigit)
402
+ else {
403
+ return nullScalar ( )
404
+ }
405
+ return . scalar( try Source . validateUnicodeScalar ( digits, . octal) )
392
406
393
- // Octal numbers.
394
- case " o " where src. tryEat ( " { " ) :
395
- let str = try src. lexUntil ( eating: " } " )
396
- return . scalar( try Source . validateUnicodeScalar ( str, . octal) )
397
-
398
- case " 0 " :
399
- // We can read *up to* 3 more octal digits.
400
- // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
401
- // PCRE mode, we should limit it here.
402
- guard let digits = src. tryEatLocatedPrefix ( maxLength: 3 , \. isOctalDigit)
403
- else {
404
- return nullScalar ( )
407
+ default :
408
+ return nil
405
409
}
406
- return . scalar( try Source . validateUnicodeScalar ( digits, . octal) )
407
-
408
- default :
409
- fatalError ( " Unexpected scalar start " )
410
410
}
411
411
} . value
412
412
}
@@ -579,7 +579,7 @@ extension Source {
579
579
580
580
/// Try to consume quoted content
581
581
///
582
- /// Quote -> '\Q' (!'\E' .)* '\E'
582
+ /// Quote -> '\Q' (!'\E' .)* '\E'?
583
583
///
584
584
/// With `SyntaxOptions.experimentalQuotes`, also accepts
585
585
///
@@ -592,9 +592,24 @@ extension Source {
592
592
mutating func lexQuote( context: ParsingContext ) throws -> AST . Quote ? {
593
593
let str = try recordLoc { src -> String ? in
594
594
if src. tryEat ( sequence: #"\Q"# ) {
595
- return try src. expectQuoted ( endingWith: #"\E"# ) . value
595
+ let contents = src. lexUntil { src in
596
+ src. isEmpty || src. tryEat ( sequence: #"\E"# )
597
+ } . value
598
+
599
+ // In multi-line literals, the quote may not span multiple lines.
600
+ if context. syntax. contains ( . multilineCompilerLiteral) ,
601
+ contents. spansMultipleLinesInRegexLiteral {
602
+ throw ParseError . quoteMayNotSpanMultipleLines
603
+ }
604
+
605
+ // The sequence must not be empty in a custom character class.
606
+ if context. isInCustomCharacterClass && contents. isEmpty {
607
+ throw ParseError . expectedNonEmptyContents
608
+ }
609
+ return contents
596
610
}
597
611
if context. experimentalQuotes, src. tryEat ( " \" " ) {
612
+ // TODO: Can experimental quotes be empty?
598
613
return try src. expectQuoted ( endingWith: " \" " , ignoreEscaped: true ) . value
599
614
}
600
615
return nil
@@ -787,6 +802,11 @@ extension Source {
787
802
mutating func lexMatchingOptionSequence(
788
803
context: ParsingContext
789
804
) throws -> AST . MatchingOptionSequence ? {
805
+ // PCRE accepts '(?)'
806
+ // TODO: This is a no-op; should we warn?
807
+ if peek ( ) == " ) " {
808
+ return . init( caretLoc: nil , adding: [ ] , minusLoc: nil , removing: [ ] )
809
+ }
790
810
let ateCaret = recordLoc { $0. tryEat ( " ^ " ) }
791
811
792
812
// TODO: Warn on duplicate options, and options appearing in both adding
@@ -820,11 +840,6 @@ extension Source {
820
840
if opt. isSemanticMatchingLevel {
821
841
throw ParseError . cannotRemoveSemanticsOptions
822
842
}
823
- // Extended syntax may not be removed if in multi-line mode.
824
- if context. syntax. contains ( . multilineExtendedSyntax) &&
825
- opt. isAnyExtended {
826
- throw ParseError . cannotRemoveExtendedSyntaxInMultilineMode
827
- }
828
843
removing. append ( opt)
829
844
}
830
845
return . init( caretLoc: nil , adding: adding, minusLoc: ateMinus. location,
@@ -1692,6 +1707,11 @@ extension Source {
1692
1707
return ref
1693
1708
}
1694
1709
1710
+ // Hexadecimal and octal unicode scalars.
1711
+ if let scalar = try src. lexUnicodeScalar ( ) {
1712
+ return scalar
1713
+ }
1714
+
1695
1715
guard let char = src. tryEat ( ) else {
1696
1716
throw ParseError . expectedEscape
1697
1717
}
@@ -1703,14 +1723,6 @@ extension Source {
1703
1723
return . escaped( builtin)
1704
1724
}
1705
1725
1706
- switch char {
1707
- // Hexadecimal and octal unicode scalars.
1708
- case " u " , " x " , " U " , " o " , " 0 " :
1709
- return try src. expectUnicodeScalar ( escapedCharacter: char)
1710
- default :
1711
- break
1712
- }
1713
-
1714
1726
// We only allow unknown escape sequences for non-letter non-number ASCII,
1715
1727
// and non-ASCII whitespace.
1716
1728
// TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.
0 commit comments