Skip to content

Commit 9807fcd

Browse files
authored
Merge pull request #493 from hamishknight/syntax-tweaks-5.7
2 parents 62195a1 + 045fb95 commit 9807fcd

File tree

6 files changed

+265
-135
lines changed

6 files changed

+265
-135
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ enum ParseError: Error, Hashable {
4444
case invalidEscape(Character)
4545
case confusableCharacter(Character)
4646

47+
case quoteMayNotSpanMultipleLines
48+
case unsetExtendedSyntaxMayNotSpanMultipleLines
49+
4750
case cannotReferToWholePattern
4851

4952
case quantifierRequiresOperand(String)
@@ -79,6 +82,7 @@ enum ParseError: Error, Hashable {
7982
case cannotRemoveTextSegmentOptions
8083
case cannotRemoveSemanticsOptions
8184
case cannotRemoveExtendedSyntaxInMultilineMode
85+
case cannotResetExtendedSyntaxInMultilineMode
8286

8387
case expectedCalloutArgument
8488

@@ -139,6 +143,10 @@ extension ParseError: CustomStringConvertible {
139143
return "invalid escape sequence '\\\(c)'"
140144
case .confusableCharacter(let c):
141145
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
146+
case .quoteMayNotSpanMultipleLines:
147+
return "quoted sequence may not span multiple lines in multi-line literal"
148+
case .unsetExtendedSyntaxMayNotSpanMultipleLines:
149+
return "group that unsets extended syntax may not span multiple lines in multi-line literal"
142150
case .cannotReferToWholePattern:
143151
return "cannot refer to whole pattern here"
144152
case .quantifierRequiresOperand(let q):
@@ -190,6 +198,8 @@ extension ParseError: CustomStringConvertible {
190198
return "semantic level cannot be unset, only changed"
191199
case .cannotRemoveExtendedSyntaxInMultilineMode:
192200
return "extended syntax may not be disabled in multi-line mode"
201+
case .cannotResetExtendedSyntaxInMultilineMode:
202+
return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead"
193203
case .expectedCalloutArgument:
194204
return "expected argument to callout"
195205
case .unrecognizedScript(let value):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 77 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -342,8 +342,8 @@ extension Source {
342342
}.value
343343
}
344344

345-
/// Eat a scalar off the front, starting from after the
346-
/// backslash and base character (e.g. `\u` or `\x`).
345+
/// Try to eat a scalar off the front, starting from after the backslash and
346+
/// base character (e.g. `\u` or `\x`).
347347
///
348348
/// UniScalar -> 'u{' UniScalarSequence '}'
349349
/// | 'u' HexDigit{4}
@@ -353,60 +353,60 @@ extension Source {
353353
/// | 'o{' OctalDigit{1...} '}'
354354
/// | '0' OctalDigit{0...3}
355355
///
356-
mutating func expectUnicodeScalar(
357-
escapedCharacter base: Character
358-
) throws -> AST.Atom.Kind {
356+
mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? {
359357
try recordLoc { src in
358+
try src.tryEating { src in
360359

361-
func nullScalar() -> AST.Atom.Kind {
362-
let pos = src.currentPosition
363-
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
364-
}
365-
366-
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
367-
switch base {
368-
// Hex numbers.
369-
case "u" where src.tryEat("{"):
370-
return try src.expectUnicodeScalarSequence(eating: "}")
371-
372-
case "x" where src.tryEat("{"):
373-
let str = try src.lexUntil(eating: "}")
374-
return .scalar(try Source.validateUnicodeScalar(str, .hex))
375-
376-
case "x":
377-
// \x expects *up to* 2 digits.
378-
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
379-
else {
380-
// In PCRE, \x without any valid hex digits is \u{0}.
381-
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
382-
// could be changed to throw an error if we had a parsing mode for
383-
// them.
384-
return nullScalar()
360+
func nullScalar() -> AST.Atom.Kind {
361+
let pos = src.currentPosition
362+
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
385363
}
386-
return .scalar(try Source.validateUnicodeScalar(digits, .hex))
387364

388-
case "u":
389-
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
390-
case "U":
391-
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
365+
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
366+
switch src.tryEat() {
367+
// Hex numbers.
368+
case "u" where src.tryEat("{"):
369+
return try src.expectUnicodeScalarSequence(eating: "}")
370+
371+
case "x" where src.tryEat("{"):
372+
let str = try src.lexUntil(eating: "}")
373+
return .scalar(try Source.validateUnicodeScalar(str, .hex))
374+
375+
case "x":
376+
// \x expects *up to* 2 digits.
377+
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
378+
else {
379+
// In PCRE, \x without any valid hex digits is \u{0}.
380+
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
381+
// could be changed to throw an error if we had a parsing mode for
382+
// them.
383+
return nullScalar()
384+
}
385+
return .scalar(try Source.validateUnicodeScalar(digits, .hex))
386+
387+
case "u":
388+
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
389+
case "U":
390+
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
391+
392+
// Octal numbers.
393+
case "o" where src.tryEat("{"):
394+
let str = try src.lexUntil(eating: "}")
395+
return .scalar(try Source.validateUnicodeScalar(str, .octal))
396+
397+
case "0":
398+
// We can read *up to* 3 more octal digits.
399+
// FIXME: PCRE can only read up to 2 octal digits; if we get a strict
400+
// PCRE mode, we should limit it here.
401+
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
402+
else {
403+
return nullScalar()
404+
}
405+
return .scalar(try Source.validateUnicodeScalar(digits, .octal))
392406

393-
// Octal numbers.
394-
case "o" where src.tryEat("{"):
395-
let str = try src.lexUntil(eating: "}")
396-
return .scalar(try Source.validateUnicodeScalar(str, .octal))
397-
398-
case "0":
399-
// We can read *up to* 3 more octal digits.
400-
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
401-
// PCRE mode, we should limit it here.
402-
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
403-
else {
404-
return nullScalar()
407+
default:
408+
return nil
405409
}
406-
return .scalar(try Source.validateUnicodeScalar(digits, .octal))
407-
408-
default:
409-
fatalError("Unexpected scalar start")
410410
}
411411
}.value
412412
}
@@ -579,7 +579,7 @@ extension Source {
579579

580580
/// Try to consume quoted content
581581
///
582-
/// Quote -> '\Q' (!'\E' .)* '\E'
582+
/// Quote -> '\Q' (!'\E' .)* '\E'?
583583
///
584584
/// With `SyntaxOptions.experimentalQuotes`, also accepts
585585
///
@@ -592,9 +592,24 @@ extension Source {
592592
mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? {
593593
let str = try recordLoc { src -> String? in
594594
if src.tryEat(sequence: #"\Q"#) {
595-
return try src.expectQuoted(endingWith: #"\E"#).value
595+
let contents = src.lexUntil { src in
596+
src.isEmpty || src.tryEat(sequence: #"\E"#)
597+
}.value
598+
599+
// In multi-line literals, the quote may not span multiple lines.
600+
if context.syntax.contains(.multilineCompilerLiteral),
601+
contents.spansMultipleLinesInRegexLiteral {
602+
throw ParseError.quoteMayNotSpanMultipleLines
603+
}
604+
605+
// The sequence must not be empty in a custom character class.
606+
if context.isInCustomCharacterClass && contents.isEmpty {
607+
throw ParseError.expectedNonEmptyContents
608+
}
609+
return contents
596610
}
597611
if context.experimentalQuotes, src.tryEat("\"") {
612+
// TODO: Can experimental quotes be empty?
598613
return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value
599614
}
600615
return nil
@@ -787,6 +802,11 @@ extension Source {
787802
mutating func lexMatchingOptionSequence(
788803
context: ParsingContext
789804
) throws -> AST.MatchingOptionSequence? {
805+
// PCRE accepts '(?)'
806+
// TODO: This is a no-op, should we warn?
807+
if peek() == ")" {
808+
return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: [])
809+
}
790810
let ateCaret = recordLoc { $0.tryEat("^") }
791811

792812
// TODO: Warn on duplicate options, and options appearing in both adding
@@ -820,11 +840,6 @@ extension Source {
820840
if opt.isSemanticMatchingLevel {
821841
throw ParseError.cannotRemoveSemanticsOptions
822842
}
823-
// Extended syntax may not be removed if in multi-line mode.
824-
if context.syntax.contains(.multilineExtendedSyntax) &&
825-
opt.isAnyExtended {
826-
throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
827-
}
828843
removing.append(opt)
829844
}
830845
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,
@@ -1692,6 +1707,11 @@ extension Source {
16921707
return ref
16931708
}
16941709

1710+
// Hexadecimal and octal unicode scalars.
1711+
if let scalar = try src.lexUnicodeScalar() {
1712+
return scalar
1713+
}
1714+
16951715
guard let char = src.tryEat() else {
16961716
throw ParseError.expectedEscape
16971717
}
@@ -1703,14 +1723,6 @@ extension Source {
17031723
return .escaped(builtin)
17041724
}
17051725

1706-
switch char {
1707-
// Hexadecimal and octal unicode scalars.
1708-
case "u", "x", "U", "o", "0":
1709-
return try src.expectUnicodeScalar(escapedCharacter: char)
1710-
default:
1711-
break
1712-
}
1713-
17141726
// We only allow unknown escape sequences for non-letter non-number ASCII,
17151727
// and non-ASCII whitespace.
17161728
// TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,8 @@ extension Parser {
289289
/// Apply the syntax options of a given matching option sequence to the
290290
/// current set of options.
291291
private mutating func applySyntaxOptions(
292-
of opts: AST.MatchingOptionSequence
293-
) {
292+
of opts: AST.MatchingOptionSequence, isScoped: Bool
293+
) throws {
294294
func mapOption(_ option: SyntaxOptions,
295295
_ pred: (AST.MatchingOption) -> Bool) {
296296
if opts.resetsCurrentOptions {
@@ -311,22 +311,41 @@ extension Parser {
311311
mapOption(.namedCapturesOnly, .namedCapturesOnly)
312312

313313
// (?x), (?xx)
314-
// We skip this for multi-line, as extended syntax is always enabled there.
314+
// This cannot be unset in a multi-line literal, unless in a scoped group
315+
// e.g. (?-x:...). We later enforce that such a group does not span multiple
316+
// lines.
315317
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
316318
// handles non-semantic whitespace in a custom character class. Other
317319
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
318320
// treat (?x) and (?xx) as the same option here. If we ever get a strict
319321
// PCRE mode, we will need to change this to handle that.
320-
if !context.syntax.contains(.multilineExtendedSyntax) {
322+
if !isScoped && context.syntax.contains(.multilineCompilerLiteral) {
323+
// An unscoped removal of extended syntax is not allowed in a multi-line
324+
// literal.
325+
if let opt = opts.removing.first(where: \.isAnyExtended) {
326+
throw Source.LocatedError(
327+
ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location)
328+
}
329+
if opts.resetsCurrentOptions {
330+
throw Source.LocatedError(
331+
ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!)
332+
}
333+
// The only remaining case is an unscoped addition of extended syntax,
334+
// which is a no-op.
335+
} else {
336+
// We either have a scoped change of extended syntax, or this is a
337+
// single-line literal.
321338
mapOption(.extendedSyntax, \.isAnyExtended)
322339
}
323340
}
324341

325342
/// Apply the syntax options of a matching option changing group to the
326343
/// current set of options.
327-
private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
344+
private mutating func applySyntaxOptions(
345+
of group: AST.Group.Kind, isScoped: Bool
346+
) throws {
328347
if case .changeMatchingOptions(let seq) = group {
329-
applySyntaxOptions(of: seq)
348+
try applySyntaxOptions(of: seq, isScoped: isScoped)
330349
}
331350
}
332351

@@ -337,14 +356,25 @@ extension Parser {
337356
context.recordGroup(kind.value)
338357

339358
let currentSyntax = context.syntax
340-
applySyntaxOptions(of: kind.value)
359+
try applySyntaxOptions(of: kind.value, isScoped: true)
341360
defer {
342361
context.syntax = currentSyntax
343362
}
344-
363+
let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) &&
364+
!context.syntax.contains(.extendedSyntax)
345365
let child = try parseNode()
346366
try source.expect(")")
347-
return .init(kind, child, loc(start))
367+
let groupLoc = loc(start)
368+
369+
// In multi-line literals, the body of a group that unsets extended syntax
370+
// may not span multiple lines.
371+
if unsetsExtendedSyntax &&
372+
context.syntax.contains(.multilineCompilerLiteral) &&
373+
source[child.location.range].spansMultipleLinesInRegexLiteral {
374+
throw Source.LocatedError(
375+
ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc)
376+
}
377+
return .init(kind, child, groupLoc)
348378
}
349379

350380
/// Consume the body of an absent function.
@@ -438,7 +468,7 @@ extension Parser {
438468
// If we have a change matching options atom, apply the syntax options. We
439469
// already take care of scoping syntax options within a group.
440470
if case .changeMatchingOptions(let opts) = atom.kind {
441-
applySyntaxOptions(of: opts)
471+
try applySyntaxOptions(of: opts, isScoped: false)
442472
}
443473
// TODO: track source locations
444474
return .atom(atom)
@@ -592,6 +622,13 @@ public func parse<S: StringProtocol>(
592622
return ast
593623
}
594624

625+
extension StringProtocol {
626+
/// Whether the given string is considered multi-line for a regex literal.
627+
var spansMultipleLinesInRegexLiteral: Bool {
628+
unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" })
629+
}
630+
}
631+
595632
/// Retrieve the default set of syntax options that a delimiter and literal
596633
/// contents indicates.
597634
fileprivate func defaultSyntaxOptions(
@@ -601,9 +638,8 @@ fileprivate func defaultSyntaxOptions(
601638
case .forwardSlash:
602639
// For an extended syntax forward slash e.g #/.../#, extended syntax is
603640
// permitted if it spans multiple lines.
604-
if delim.poundCount > 0 &&
605-
contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) {
606-
return .multilineExtendedSyntax
641+
if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral {
642+
return [.multilineCompilerLiteral, .extendedSyntax]
607643
}
608644
return .traditional
609645
case .reSingleQuote:

Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,10 @@ public struct SyntaxOptions: OptionSet {
5858
/// `(_: .*)` == `(?:.*)`
5959
public static var experimentalCaptures: Self { Self(1 << 5) }
6060

61-
/// The default syntax for a multi-line regex literal.
62-
public static var multilineExtendedSyntax: Self {
63-
return [Self(1 << 6), .extendedSyntax]
64-
}
61+
/// The syntax kind of a multi-line literal. This will always be set when
62+
/// parsing a multi-line `#/.../#` literal. Note this does not imply extended
63+
/// syntax, as that may be temporarily disabled while parsing.
64+
public static var multilineCompilerLiteral: Self { Self(1 << 6) }
6565

6666
/// `(?n)`
6767
public static var namedCapturesOnly: Self { Self(1 << 7) }
@@ -76,8 +76,8 @@ public struct SyntaxOptions: OptionSet {
7676
public static var traditional: Self { Self(0) }
7777

7878
public static var experimental: Self {
79-
// Experimental syntax enables everything except end-of-line comments.
80-
Self(~0).subtracting(.endOfLineComments)
79+
[.nonSemanticWhitespace, .experimentalQuotes, .experimentalComments,
80+
.experimentalRanges, .experimentalCaptures]
8181
}
8282

8383
// TODO: Probably want to model strict-PCRE etc. options too.

0 commit comments

Comments
 (0)