Skip to content

Add assertions to the DSL #154

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 21, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions Sources/_StringProcessing/RegexDSL/Assertion.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import _MatchingEngine

public struct Assertion {
/// The internal discriminator for what an `Assertion` checks.
///
/// Every case except `lookahead` is translated to an
/// `AST.Atom.AssertionKind` by `astAssertion`; `lookahead` is instead
/// turned into a group node by `regex`.
internal enum Kind {
case startOfSubject
case endOfSubjectBeforeNewline
case endOfSubject
case firstMatchingPositionInSubject
// Translated to `.textSegment` / `.notTextSegment` in `astAssertion`.
case textSegmentBoundary
case startOfLine
case endOfLine
// Translated to `.wordBoundary` / `.notWordBoundary` in `astAssertion`.
case wordBoundary
// Carries the DSL tree node that becomes the body of the lookahead group.
case lookahead(DSLTree.Node)
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this meant to be a listing of built-in assertions, or are each of these the kinds of assertions someone could write?


var kind: Kind
var isInverted: Bool = false
}

extension Assertion: RegexProtocol {
var astAssertion: AST.Atom.AssertionKind? {
if !isInverted {
switch kind {
case .startOfSubject: return .startOfSubject
case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline
case .endOfSubject: return .endOfSubject
case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject
case .textSegmentBoundary: return .textSegment
case .startOfLine: return .startOfLine
case .endOfLine: return .endOfLine
case .wordBoundary: return .wordBoundary
default: return nil
}
} else {
switch kind {
case .startOfSubject: fatalError("Not yet supported")
case .endOfSubjectBeforeNewline: fatalError("Not yet supported")
case .endOfSubject: fatalError("Not yet supported")
case .firstMatchingPositionInSubject: fatalError("Not yet supported")
case .textSegmentBoundary: return .notTextSegment
case .startOfLine: fatalError("Not yet supported")
case .endOfLine: fatalError("Not yet supported")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't currently have a representation for these negated assertions in the AST, since things like notWordBoundary are represented as specific individual cases. These fatalError'd cases don't have a regex literal equivalent, but would be available if we use an API like Assertion.wordBoundary.inverted.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we going to an AST here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DSLTree tracks assertions using AST.Atom.AssertionKind right now.

case .wordBoundary: return .notWordBoundary
default: return nil
}
}
}

/// The regex that this assertion contributes to a DSL pattern.
///
/// Assertions with an `AST.Atom.AssertionKind` equivalent become an
/// `.atom(.assertion(...))` node. Lookaheads have no such equivalent and
/// become a `.lookahead` or `.negativeLookahead` group, chosen by
/// `isInverted`.
///
/// Traps with "Unsupported assertion" when the assertion has neither
/// an AST equivalent nor a lookahead payload.
public var regex: Regex<Substring> {
  // Ask for the AST equivalent first; `astAssertion` itself traps for
  // inverted kinds that are not supported yet.
  guard let atomKind = astAssertion else {
    // Without an AST equivalent, only lookaheads can be expressed.
    guard case let .lookahead(body) = kind else {
      fatalError("Unsupported assertion")
    }

    if isInverted {
      return Regex(node: .group(.negativeLookahead, body))
    }
    return Regex(node: .group(.lookahead, body))
  }

  return Regex(node: .atom(.assertion(atomKind)))
}
}

// MARK: - Public API

extension Assertion {
/// A non-inverted assertion of kind `.startOfSubject`.
///
/// Its regex is the `.startOfSubject` AST assertion atom. The inverted
/// form is not yet supported and traps when its regex is requested.
public static var startOfSubject: Assertion {
  return Assertion(kind: .startOfSubject, isInverted: false)
}

/// A non-inverted assertion of kind `.endOfSubjectBeforeNewline`.
///
/// Its regex is the `.endOfSubjectBeforeNewline` AST assertion atom. The
/// inverted form is not yet supported and traps when its regex is requested.
public static var endOfSubjectBeforeNewline: Assertion {
  return Assertion(kind: .endOfSubjectBeforeNewline, isInverted: false)
}

/// A non-inverted assertion of kind `.endOfSubject`.
///
/// Its regex is the `.endOfSubject` AST assertion atom. The inverted form
/// is not yet supported and traps when its regex is requested.
public static var endOfSubject: Assertion {
  return Assertion(kind: .endOfSubject, isInverted: false)
}

// TODO: Are we supporting this?
// public static var resetStartOfMatch: Assertion {
// Assertion(kind: .resetStartOfMatch)
// }

/// A non-inverted assertion of kind `.firstMatchingPositionInSubject`.
///
/// Its regex is the `.firstMatchingPositionInSubject` AST assertion atom.
/// The inverted form is not yet supported and traps when its regex is
/// requested.
public static var firstMatchingPositionInSubject: Assertion {
  return Assertion(kind: .firstMatchingPositionInSubject, isInverted: false)
}

/// A non-inverted assertion of kind `.textSegmentBoundary`.
///
/// Its regex is the `.textSegment` AST assertion atom; the inverted form
/// maps to `.notTextSegment`.
public static var textSegmentBoundary: Assertion {
  return Assertion(kind: .textSegmentBoundary, isInverted: false)
}

/// A non-inverted assertion of kind `.startOfLine`.
///
/// Its regex is the `.startOfLine` AST assertion atom. The inverted form
/// is not yet supported and traps when its regex is requested.
public static var startOfLine: Assertion {
  return Assertion(kind: .startOfLine, isInverted: false)
}

/// A non-inverted assertion of kind `.endOfLine`.
///
/// Its regex is the `.endOfLine` AST assertion atom. The inverted form
/// is not yet supported and traps when its regex is requested.
public static var endOfLine: Assertion {
  return Assertion(kind: .endOfLine, isInverted: false)
}

/// A non-inverted assertion of kind `.wordBoundary`.
///
/// Its regex is the `.wordBoundary` AST assertion atom; the inverted form
/// maps to `.notWordBoundary`.
public static var wordBoundary: Assertion {
  return Assertion(kind: .wordBoundary, isInverted: false)
}

/// A copy of this assertion with the same kind and the opposite
/// `isInverted` flag.
///
/// Inverting twice yields an assertion equivalent to the original.
public var inverted: Assertion {
  return Assertion(kind: kind, isInverted: !isInverted)
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would we want an isInverted then? Also, is it the case that all anchors can be inverted?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know that a property makes sense if we aren't going to also expose kind as public API, and the purpose of this is just to carry the regex wrapper.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Everything in this PR can be inverted, just need a little more plumbing. If we want to provide the functionality of a "reset match" assertion, that could just be a separate function or type, since it isn't an anchor anyway.

}

// MARK: - Lookahead

extension Assertion {
  /// Creates a lookahead assertion from a regex builder closure.
  ///
  /// - Parameter content: A builder closure producing the component to
  ///   look ahead for. It is invoked once, immediately.
  /// - Returns: A non-inverted assertion of kind `.lookahead` carrying the
  ///   root node of the built component's regex.
  public static func lookahead<R: RegexProtocol>(
    @RegexBuilder _ content: () -> R
  ) -> Assertion {
    let built = content()
    return Assertion(kind: .lookahead(built.regex.root))
  }

  /// Creates a lookahead assertion from an existing regex component.
  ///
  /// - Parameter component: The component to look ahead for.
  /// - Returns: A non-inverted assertion of kind `.lookahead` carrying the
  ///   root node of `component`'s regex.
  public static func lookahead<R: RegexProtocol>(_ component: R) -> Assertion {
    let root = component.regex.root
    return Assertion(kind: .lookahead(root))
  }
}
56 changes: 56 additions & 0 deletions Sources/_StringProcessing/RegexDSL/DSL.swift
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,59 @@ public func oneOf<R: RegexProtocol>(
) -> R {
builder()
}

// MARK: - Capture

/// A regex component that wraps another component in a capture group.
///
/// `Match` is the match type of the resulting regex. All initializers are
/// internal; each builds a `.capture` group node around the wrapped
/// component's root node.
public struct CapturingGroup<Match>: RegexProtocol {
/// The regex whose root is the capture group node built by the initializer.
public let regex: Regex<Match>

/// Wraps `component` in a plain `.capture` group with no transform.
init<Component: RegexProtocol>(
_ component: Component
) {
self.regex = .init(node: .group(
.capture, component.regex.root))
}

/// Wraps `component` in a `.capture` group whose node also carries
/// `transform` (a `.groupTransform` node).
init<Component: RegexProtocol>(
_ component: Component,
transform: CaptureTransform
) {
self.regex = .init(node: .groupTransform(
.capture,
component.regex.root,
transform))
}

/// Wraps `component` in a capture group with a non-throwing transform.
///
/// The closure is boxed in a `CaptureTransform` whose `resultType` is
/// `NewCapture.self`, with its result erased to `Any`.
init<NewCapture, Component: RegexProtocol>(
_ component: Component,
transform: @escaping (Substring) -> NewCapture
) {
self.init(
component,
transform: CaptureTransform(resultType: NewCapture.self) {
// The `as Any` cast erases the result type; it presumably also selects
// the matching `CaptureTransform` initializer — confirm there.
transform($0) as Any
})
}

/// Wraps `component` in a capture group with a throwing transform.
///
/// Same as the non-throwing form, except the boxed closure calls
/// `transform` with `try`.
init<NewCapture, Component: RegexProtocol>(
_ component: Component,
transform: @escaping (Substring) throws -> NewCapture
) {
self.init(
component,
transform: CaptureTransform(resultType: NewCapture.self) {
try transform($0) as Any
})
}

/// Wraps `component` in a capture group with a transform that returns an
/// optional.
///
/// The closure result is erased to `Any?` rather than `Any`, so a `nil`
/// result is passed through as `nil`.
init<NewCapture, Component: RegexProtocol>(
_ component: Component,
transform: @escaping (Substring) -> NewCapture?
) {
self.init(
component,
transform: CaptureTransform(resultType: NewCapture.self) {
transform($0) as Any?
})
}
}
25 changes: 25 additions & 0 deletions Tests/RegexTests/RegexDSLTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,31 @@ class RegexDSLTests: XCTestCase {
}
}
}

func testAssertions() throws {
try _testDSLCaptures(
("aaaaab", "aaaaab"),
("caaaaab", nil),
("aaaaabc", nil),
captureType: Substring.self, ==)
{
Assertion.startOfLine
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Many of the built-in ones are more commonly called "anchors", which might be worth considering too.

"a".+
"b"
Assertion.endOfLine
}

try _testDSLCaptures(
("aaaaa1", "aaaaa1"),
("aaaaa", nil),
("aaaaab", nil),
captureType: Substring.self, ==)
{
"a".+
Assertion.lookahead(CharacterClass.digit)
CharacterClass.word
}
}

func testNestedGroups() throws {
try _testDSLCaptures(
Expand Down