Multi-line and extended delimiter literal lexing #242

Merged · 7 commits · Apr 1, 2022
3 changes: 2 additions & 1 deletion Sources/_RegexParser/Regex/AST/MatchingOptions.swift
@@ -137,7 +137,8 @@ extension AST {
/// Global matching option specifiers. Unlike `MatchingOptionSequence`,
/// these must appear at the start of the pattern, and apply globally.
public struct GlobalMatchingOption: _ASTNode, Hashable {
/// Determines the definition of a newline for the '.' character class.
/// Determines the definition of a newline for the '.' character class and
/// when parsing end-of-line comments.
public enum NewlineMatching: Hashable {
/// (*CR*)
case carriageReturnOnly
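To illustrate the amended doc comment above (this example is not part of the diff, and the pattern is only an assumed PCRE-style illustration): a global specifier such as (*CR) makes the carriage return the sole newline, which affects both what '.' refuses to match and where an extended-syntax '#' comment ends.

// Illustrative only: with (*CR), '\r' is the only newline, so the '#' comment
// below runs until the next carriage return rather than until '\n'.
let pattern = "(*CR)(?x) a+ # trailing comment \r b+"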
238 changes: 185 additions & 53 deletions Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
@@ -9,42 +9,78 @@
//
//===----------------------------------------------------------------------===//

// TODO: mock up multi-line soon

enum Delimiter: Hashable, CaseIterable {
case traditional
case experimental
case reSingleQuote
case rxSingleQuote

var openingAndClosing: (opening: String, closing: String) {
switch self {
case .traditional: return ("#/", "/#")
case .experimental: return ("#|", "|#")
case .reSingleQuote: return ("re'", "'")
case .rxSingleQuote: return ("rx'", "'")
struct Delimiter: Hashable {
let kind: Kind
let poundCount: Int

init(_ kind: Kind, poundCount: Int) {
precondition(kind.allowsExtendedPoundSyntax || poundCount == 0)
self.kind = kind
self.poundCount = poundCount
}

var opening: String {
String(repeating: "#", count: poundCount) + kind.opening
}
var closing: String {
kind.closing + String(repeating: "#", count: poundCount)
}

/// Whether or not multi-line mode is permitted.
var allowsMultiline: Bool {
switch kind {
case .forwardSlash:
return poundCount > 0
case .experimental, .reSingleQuote, .rxSingleQuote:
return false
}
}
var opening: String { openingAndClosing.opening }
var closing: String { openingAndClosing.closing }

/// The default set of syntax options that the delimiter indicates.
var defaultSyntaxOptions: SyntaxOptions {
switch self {
case .traditional, .reSingleQuote:
return .traditional
case .experimental, .rxSingleQuote:
return .experimental

/// The delimiters which are currently enabled.
static var enabledDelimiters: [Kind] { [.forwardSlash] }

/// All known delimiters.
static var allDelimiters: [Kind] { Kind.allCases }
}

extension Delimiter {
enum Kind: Hashable, CaseIterable {
case forwardSlash
case experimental
case reSingleQuote
case rxSingleQuote

var openingAndClosing: (opening: String, closing: String) {
switch self {
case .forwardSlash: return ("/", "/")
case .experimental: return ("#|", "|#")
case .reSingleQuote: return ("re'", "'")
case .rxSingleQuote: return ("rx'", "'")
}
}
var opening: String { openingAndClosing.opening }
var closing: String { openingAndClosing.closing }

/// Whether or not extended pound syntax, e.g. `##/.../##`, is allowed with
/// this delimiter.
var allowsExtendedPoundSyntax: Bool {
switch self {
case .forwardSlash:
return true
case .experimental, .reSingleQuote, .rxSingleQuote:
return false
}
}
}
}
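As a concrete illustration of the two types above (not part of the diff): the pound count composes with the kind's opening and closing, and only the forward-slash delimiter accepts extended pound syntax.

// Minimal sketch based on the definitions above.
let plain = Delimiter(.forwardSlash, poundCount: 0)    // lexes `/.../`
let pounded = Delimiter(.forwardSlash, poundCount: 2)  // lexes `##/.../##`
assert(pounded.opening == "##/" && pounded.closing == "/##")
assert(pounded.allowsMultiline && !plain.allowsMultiline)
// Delimiter(.experimental, poundCount: 1) would trip the precondition:
// only .forwardSlash allows extended pound syntax.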

struct DelimiterLexError: Error, CustomStringConvertible {
enum Kind: Hashable {
case endOfString
case unterminated
case invalidUTF8 // TODO: better range reporting
case unknownDelimiter
case unprintableASCII
case multilineClosingNotOnNewline
}

var kind: Kind
@@ -59,10 +95,11 @@ struct DelimiterLexError: Error, CustomStringConvertible {

var description: String {
switch kind {
case .endOfString: return "unterminated regex literal"
case .unterminated: return "unterminated regex literal"
case .invalidUTF8: return "invalid UTF-8 found in source file"
case .unknownDelimiter: return "unknown regex literal delimiter"
case .unprintableASCII: return "unprintable ASCII character found in source file"
case .multilineClosingNotOnNewline: return "closing delimiter must appear on new line"
}
}
}
@@ -72,11 +109,18 @@ fileprivate struct DelimiterLexer {
var cursor: UnsafeRawPointer
let end: UnsafeRawPointer

init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
var firstNewline: UnsafeRawPointer?
var isMultiline: Bool { firstNewline != nil }

let delimiters: [Delimiter.Kind]

init(start: UnsafeRawPointer, end: UnsafeRawPointer,
delimiters: [Delimiter.Kind]) {
precondition(start <= end)
self.start = start
self.cursor = start
self.end = end
self.delimiters = delimiters
}

func ascii(_ s: Unicode.Scalar) -> UInt8 {
@@ -120,25 +164,34 @@ fileprivate struct DelimiterLexer {
precondition(cursor <= end, "Cannot advance past end")
}

/// Check to see if a UTF-8 sequence can be eaten from the current cursor.
func canEat(_ utf8: String.UTF8View) -> Bool {
guard let slice = slice(utf8.count) else { return false }
return slice.elementsEqual(utf8)
/// Check to see if a byte sequence can be eaten from the current cursor.
func canEat<C : Collection>(_ bytes: C) -> Bool where C.Element == UInt8 {
guard let slice = slice(bytes.count) else { return false }
return slice.elementsEqual(bytes)
}

/// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
guard canEat(utf8) else { return false }
advanceCursor(utf8.count)
/// Attempt to eat a byte sequence, returning `true` if successful.
mutating func tryEat<C : Collection>(
_ bytes: C
) -> Bool where C.Element == UInt8 {
guard canEat(bytes) else { return false }
advanceCursor(bytes.count)
return true
}

/// Attempt to eat an ascii scalar, returning `true` if successful.
mutating func tryEat(ascii s: Unicode.Scalar) -> Bool {
guard load() == ascii(s) else { return false }
advanceCursor()
return true
}

/// Attempt to skip over a closing delimiter character that is unlikely to be
/// the actual closing delimiter.
mutating func trySkipDelimiter(_ delimiter: Delimiter) {
// Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
switch delimiter {
case .traditional, .experimental:
switch delimiter.kind {
case .forwardSlash, .experimental:
return
case .reSingleQuote, .rxSingleQuote:
break
@@ -222,12 +275,23 @@ fileprivate struct DelimiterLexer {
let contentsEnd = cursor
guard tryEat(delimiter.closing.utf8) else { return nil }

// Form a string from the contents and make sure it's valid UTF-8.
let count = contentsEnd - contentsStart
let contents = UnsafeRawBufferPointer(
start: contentsStart, count: count)
let s = String(decoding: contents, as: UTF8.self)

// In multi-line mode, we must be on a new line. So scan backwards and make
// sure we only have whitespace until the newline.
if isMultiline {
let idx = contents.lastIndex(
where: { $0 == ascii("\n") || $0 == ascii("\r") })! + 1
guard contents[idx...].all({ $0 == ascii(" ") || $0 == ascii("\t") })
else {
throw DelimiterLexError(.multilineClosingNotOnNewline, resumeAt: cursor)
}
}

// Form a string from the contents and make sure it's valid UTF-8.
let s = String(decoding: contents, as: UTF8.self)
guard s.utf8.elementsEqual(contents) else {
throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
}
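Two illustrative pieces of literal source text (not part of the diff) for the multi-line closing check above: in multi-line mode the closing delimiter must be preceded on its line only by spaces or tabs.

// Accepted: the closing delimiter sits on its own line after whitespace.
let accepted = """
  #/
    [0-9]+
    /#
  """
// Rejected with .multilineClosingNotOnNewline: content precedes the closing
// delimiter on the same line.
let notOnItsOwnLine = """
  #/
    [0-9]+ /#
  """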
@@ -238,7 +302,10 @@ fileprivate struct DelimiterLexer {
/// the end of the buffer is reached.
mutating func advance(escaped: Bool = false) throws {
guard let next = load() else {
throw DelimiterLexError(.endOfString, resumeAt: cursor)
// We've hit the end of the buffer. In multi-line mode, we don't want to
// skip over what is likely otherwise valid Swift code, so resume from the
// first newline.
throw DelimiterLexError(.unterminated, resumeAt: firstNewline ?? cursor)
}
switch UnicodeScalar(next) {
case let next where !next.isASCII:
@@ -249,7 +316,10 @@ fileprivate struct DelimiterLexer {
advanceCursor()

case "\n", "\r":
throw DelimiterLexError(.endOfString, resumeAt: cursor)
guard isMultiline else {
throw DelimiterLexError(.unterminated, resumeAt: cursor)
}
advanceCursor()

case "\0":
// TODO: Warn to match the behavior of String literal lexer? Or should
@@ -261,8 +331,12 @@ fileprivate struct DelimiterLexer {
advanceCursor()
try advance(escaped: true)

case let next where !next.isPrintableASCII:
case let next
where !next.isPrintableASCII && !(isMultiline && next == "\t"):
// Diagnose unprintable ASCII.
// Note that tabs are allowed in multi-line literals.
// TODO: This matches the string literal behavior, but should we allow
// tabs for single-line regex literals too?
// TODO: Ideally we would recover and continue to lex until the ending
// delimiter.
throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
@@ -272,17 +346,60 @@ fileprivate struct DelimiterLexer {
}
}

mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? {
for kind in delimiters {
// If the delimiter allows extended pound syntax, or there are no pounds,
// we just need to lex it.
let opening = kind.opening.utf8
if kind.allowsExtendedPoundSyntax || poundCount == 0 {
guard tryEat(opening) else { continue }
return Delimiter(kind, poundCount: poundCount)
}

// The delimiter doesn't allow extended pound syntax, so the pounds must be
// part of the delimiter.
guard
poundCount < opening.count,
opening.prefix(poundCount)
.elementsEqual(repeatElement(ascii("#"), count: poundCount)),
tryEat(opening.dropFirst(poundCount))
else { continue }

return Delimiter(kind, poundCount: 0)
}
return nil
}
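A couple of illustrative checks (not part of the diff) of the pound handling this function implements, expressed through droppingRegexDelimiters from later in this file: for `/.../` the pounds are extended delimiter syntax, while for `#|...|#` the pound belongs to the opening itself. Note that droppingRegexDelimiters considers all known delimiters, not just the enabled ones.

assert(droppingRegexDelimiters("##/[a-z]+/##")
        == ("[a-z]+", Delimiter(.forwardSlash, poundCount: 2)))
assert(droppingRegexDelimiters("#|[a-z]+|#")
        == ("[a-z]+", Delimiter(.experimental, poundCount: 0)))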

/*consuming*/ mutating func lex(
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
// We can consume any number of pound signs.
var poundCount = 0
while tryEat(ascii: "#") {
poundCount += 1
}

// Try to lex the opening delimiter.
guard let delimiter = Delimiter.allCases.first(
where: { tryEat($0.opening.utf8) }
) else {
guard let delimiter = tryLexOpeningDelimiter(poundCount: poundCount) else {
throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
}

let contentsStart = cursor

// If the delimiter allows multi-line, try skipping over any whitespace to a
// newline character. If we can do that, we enter multi-line mode.
if delimiter.allowsMultiline {
while let next = load() {
switch next {
case ascii(" "), ascii("\t"):
advanceCursor()
continue
case ascii("\n"), ascii("\r"):
firstNewline = cursor
default:
break
}
break
}
}
while true {
// Check to see if we're at a character that looks like a delimiter, but
// likely isn't. In such a case, we can attempt to skip over it.
@@ -302,20 +419,34 @@ fileprivate struct DelimiterLexer {
/// Drop a set of regex delimiters from the input string, returning the contents
/// and the delimiter used. The input string must have valid delimiters.
func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
func stripDelimiter(_ delim: Delimiter) -> String? {
func stripDelimiter(_ kind: Delimiter.Kind) -> (String, Delimiter)? {
var slice = str.utf8[...]

// Try to strip any number of opening '#'s.
var poundCount = 0
if kind.allowsExtendedPoundSyntax {
poundCount = slice.prefix(while: {
$0 == UInt8(("#" as UnicodeScalar).value)
}).count
slice = slice.dropFirst(poundCount)
}

// The opening delimiter must match.
guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8)
guard var slice = slice.tryDropPrefix(kind.opening.utf8)
else { return nil }

// The closing delimiter may optionally match, as it may not be present in
// invalid code.
let delim = Delimiter(kind, poundCount: poundCount)
if let newSlice = slice.tryDropSuffix(delim.closing.utf8) {
slice = newSlice
}
return String(slice)
let result = String(decoding: slice, as: UTF8.self)
precondition(result.utf8.elementsEqual(slice))
return (result, delim)
}
for d in Delimiter.allCases {
if let contents = stripDelimiter(d) {
for kind in Delimiter.allDelimiters {
if let (contents, d) = stripDelimiter(kind) {
return (contents, d)
}
}
@@ -325,8 +456,9 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
/// Attempt to lex a regex literal between `start` and `end`, returning either
/// the contents and pointer from which to resume lexing, or an error.
func lexRegex(
start: UnsafeRawPointer, end: UnsafeRawPointer
start: UnsafeRawPointer, end: UnsafeRawPointer,
delimiters: [Delimiter.Kind] = Delimiter.enabledDelimiters
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
var lexer = DelimiterLexer(start: start, end: end)
var lexer = DelimiterLexer(start: start, end: end, delimiters: delimiters)
return try lexer.lex()
}
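Putting the pieces together, here is a hedged usage sketch (not part of the diff) of the updated entry point on a multi-line literal; the buffer plumbing is illustrative only.

let source = """
#/
  [0-9]+   # comments and tabs are fine; multi-line implies extended syntax
  /#
"""
Array(source.utf8).withUnsafeBytes { buf in
  do {
    let (contents, delimiter, _) = try lexRegex(
      start: buf.baseAddress!, end: buf.baseAddress! + buf.count)
    // delimiter.kind == .forwardSlash, delimiter.poundCount == 1, and the
    // leading pound is what permits multi-line mode.
    print(contents)
  } catch let error as DelimiterLexError {
    print(error)  // e.g. "closing delimiter must appear on new line"
  } catch {
    print(error)
  }
}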
4 changes: 4 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
@@ -70,6 +70,8 @@ enum ParseError: Error, Hashable {

case cannotRemoveTextSegmentOptions
case cannotRemoveSemanticsOptions
case cannotRemoveExtendedSyntaxInMultilineMode

case expectedCalloutArgument
}

@@ -158,6 +160,8 @@ extension ParseError: CustomStringConvertible {
return "text segment mode cannot be unset, only changed"
case .cannotRemoveSemanticsOptions:
return "semantic level cannot be unset, only changed"
case .cannotRemoveExtendedSyntaxInMultilineMode:
return "extended syntax may not be disabled in multi-line mode"
case .expectedCalloutArgument:
return "expected argument to callout"
}
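An illustrative literal (not part of the diff) that would produce the new diagnostic when parsed: multi-line literals imply extended syntax, so attempting to turn it off is rejected.

// Assumed example: `(?-x)` tries to disable extended syntax inside a
// multi-line literal and would be diagnosed as
// "extended syntax may not be disabled in multi-line mode".
let multilineWithDisabledExtendedSyntax = """
  #/
    (?-x) [0-9]+
    /#
  """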