Skip to content

Adds support for case insensitivity #168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 50 additions & 9 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,11 @@ extension Compiler.ByteCodeGen {
emitAny()

case let .char(c):
// FIXME: Does semantic level matter?
builder.buildMatch(c)

try emitCharacter(c)

case let .scalar(s):
// TODO: Native instruction
builder.buildConsume(by: consumeScalar {
$0 == s
})

try emitScalar(s)

case let .assertion(kind):
try emitAssertion(kind)

Expand Down Expand Up @@ -135,6 +131,36 @@ extension Compiler.ByteCodeGen {
}
}
}

/// Emits a consumer instruction that matches the single Unicode scalar `s`.
///
/// When `options.isCaseInsensitive` is set, an input scalar matches if its
/// `lowercaseMapping` equals that of `s`; otherwise the input scalar must be
/// exactly equal to `s`.
///
/// - Parameter s: The scalar from the pattern to match against the input.
mutating func emitScalar(_ s: UnicodeScalar) throws {
  // TODO: Native instruction buildMatchScalar(s)
  if options.isCaseInsensitive {
    // TODO: e.g. buildCaseInsensitiveMatchScalar(s)
    // The pattern side of the comparison never changes, so compute its
    // lowercase mapping once here instead of on every match attempt.
    let loweredPattern = s.properties.lowercaseMapping
    builder.buildConsume(by: consumeScalar {
      $0.properties.lowercaseMapping == loweredPattern
    })
  } else {
    builder.buildConsume(by: consumeScalar {
      $0 == s
    })
  }
}

/// Emits an instruction that matches the single character `c`.
///
/// When `options.isCaseInsensitive` is set and `c` is a cased character, a
/// custom consumer is emitted that compares the lowercased input character
/// with the lowercased `c`. In every other situation the builder's plain
/// `buildMatch` is used.
///
/// - Parameter c: The character from the pattern to match against the input.
mutating func emitCharacter(_ c: Character) throws {
  // FIXME: Does semantic level matter?
  if options.isCaseInsensitive && c.isCased {
    // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
    // The pattern side of the comparison never changes, so lowercase it once
    // here instead of on every match attempt.
    let loweredPattern = c.lowercased()
    builder.buildConsume { input, bounds in
      let start = bounds.lowerBound
      // An empty range means there is nothing left to consume; fail the match
      // rather than reading a character at or beyond `bounds.upperBound`.
      guard start < bounds.upperBound,
            input[start].lowercased() == loweredPattern
      else { return nil }
      return input.index(after: start)
    }
  } else {
    builder.buildMatch(c)
  }
}

mutating func emitAny() {
switch (options.semanticLevel, options.dotMatchesNewline) {
Expand Down Expand Up @@ -513,7 +539,22 @@ extension Compiler.ByteCodeGen {

case let .quotedLiteral(s):
// TODO: Should this incorporate options?
builder.buildMatchSequence(s)
if options.isCaseInsensitive {
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
builder.buildConsume { input, bounds in
var iterator = s.makeIterator()
var currentIndex = bounds.lowerBound
while let ch = iterator.next() {
guard currentIndex < bounds.upperBound,
ch.lowercased() == input[currentIndex].lowercased()
else { return nil }
input.formIndex(after: &currentIndex)
}
return currentIndex
}
} else {
builder.buildMatchSequence(s)
}

case let .regexLiteral(l):
try emitNode(l.dslTreeNode)
Expand Down
71 changes: 53 additions & 18 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -51,24 +51,34 @@ extension DSLTree.Node {
}

extension DSLTree.Atom {
// TODO: If ByteCodeGen switches first, then this is
// unnecessary...
// TODO: If ByteCodeGen switches first, then this is unnecessary for
// top-level nodes, but it's also invoked for `.atom` members of a custom CC
func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram<String>.ConsumeFunction? {
let isCaseInsensitive = opts.isCaseInsensitive

switch self {

case let .char(c):
// TODO: Match level?
return { input, bounds in
let low = bounds.lowerBound
guard input[low] == c else {
return nil
if isCaseInsensitive && c.isCased {
return input[low].lowercased() == c.lowercased()
? input.index(after: low)
: nil
} else {
return input[low] == c
? input.index(after: low)
: nil
}
return input.index(after: low)
}
case let .scalar(s):
return consumeScalar { $0 == s }
return consumeScalar {
isCaseInsensitive
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
: $0 == s
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still feel like there's quite a bit of duplicated code throughout. Would a helper function, similar to how consumeScalar is a helper function for making predicate-based scalar consumers, alleviate this some? Maybe something that is parameterized over case sensitivity?

}

case .any:
// FIXME: Should this be a total ordering?
Expand Down Expand Up @@ -187,14 +197,30 @@ extension DSLTree.CustomCharacterClass.Member {
throw Unsupported("\(high) in range")
}

return { input, bounds in
// TODO: check for out of bounds?
let curIdx = bounds.lowerBound
if (lhs...rhs).contains(input[curIdx]) {
// TODO: semantic level
return input.index(after: curIdx)
if opts.isCaseInsensitive {
let lhsLower = lhs.lowercased()
let rhsLower = rhs.lowercased()
guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
return { input, bounds in
// TODO: check for out of bounds?
let curIdx = bounds.lowerBound
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
// TODO: semantic level
return input.index(after: curIdx)
}
return nil
}
} else {
guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
return { input, bounds in
// TODO: check for out of bounds?
let curIdx = bounds.lowerBound
if (lhs...rhs).contains(input[curIdx]) {
// TODO: semantic level
return input.index(after: curIdx)
}
return nil
}
return nil
}

case let .custom(ccc):
Expand Down Expand Up @@ -237,11 +263,20 @@ extension DSLTree.CustomCharacterClass.Member {
return rhs(input, bounds)
}
case .quotedLiteral(let s):
return { input, bounds in
guard s.contains(input[bounds.lowerBound]) else {
return nil
if opts.isCaseInsensitive {
return { input, bounds in
guard s.lowercased().contains(input[bounds.lowerBound].lowercased()) else {
return nil
}
return input.index(after: bounds.lowerBound)
}
} else {
return { input, bounds in
guard s.contains(input[bounds.lowerBound]) else {
return nil
}
return input.index(after: bounds.lowerBound)
}
return input.index(after: bounds.lowerBound)
}
case .trivia:
// TODO: Should probably strip this earlier...
Expand Down
9 changes: 8 additions & 1 deletion Sources/_StringProcessing/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct MatchingOptions {
}
}

// Compiler API
// MARK: Compilation API
extension MatchingOptions {
/// Creates an instance with the default options.
init() {
Expand All @@ -51,7 +51,14 @@ extension MatchingOptions {
stack[stack.count - 1].apply(sequence)
_invariantCheck()
}
}

// MARK: Matching behavior API
extension MatchingOptions {
/// Whether the last entry of the options stack contains `.caseInsensitive`.
///
/// NOTE(review): force-unwraps `stack.last`, so this assumes the stack is
/// never empty — presumably upheld by `_invariantCheck()`; confirm.
var isCaseInsensitive: Bool {
  let current = stack.last!
  return current.contains(.caseInsensitive)
}

var isReluctantByDefault: Bool {
stack.last!.contains(.reluctantByDefault)
}
Expand Down
45 changes: 43 additions & 2 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,9 @@ func matchTest(
syntax: SyntaxOptions = .traditional,
enableTracing: Bool = false,
dumpAST: Bool = false,
xfail: Bool = false
xfail: Bool = false,
file: StaticString = #file,
line: UInt = #line
) {
for (test, expect) in tests {
firstMatchTest(
Expand All @@ -123,7 +125,9 @@ func matchTest(
syntax: syntax,
enableTracing: enableTracing,
dumpAST: dumpAST,
xfail: xfail)
xfail: xfail,
file: file,
line: line)
}
}

Expand Down Expand Up @@ -1117,6 +1121,43 @@ extension RegexTests {
firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb")
}

/// Checks matching with and without the `(?i)` option for wildcard patterns,
/// literal characters, `\u{...}` scalar escapes, and custom character classes.
/// Each tuple pairs an input string with whether a match is expected.
func testCaseSensitivity() {
// Baseline without `(?i)`: the literal `c` must not match an uppercase "C".
matchTest(
#"c..e"#,
("cafe", true),
("Cafe", false))
// With `(?i)`: literal letters mixed with `.` match regardless of input case.
matchTest(
#"(?i)c.f."#,
("cafe", true),
("Cafe", true),
("caFe", true))
// With `(?i)`: a pattern made only of literal letters.
matchTest(
#"(?i)cafe"#,
("cafe", true),
("Cafe", true),
("caFe", true))
// With `(?i)`: a non-ASCII cased letter ("é" / "É") in the pattern and input.
matchTest(
#"(?i)café"#,
("café", true),
("CafÉ", true))
// Same expectations as above, but "c" and "é" are written as the scalar
// escapes `\u{63}` and `\u{e9}`.
matchTest(
#"(?i)\u{63}af\u{e9}"#,
("café", true),
("CafÉ", true))

// Custom character class without `(?i)`: only the exact-case members
// c, a, F, E are accepted.
matchTest(
#"[caFE]{4}"#,
("cafe", false),
("CAFE", false),
("caFE", true),
("EFac", true))
// Custom character class with `(?i)`: members are accepted in either case.
matchTest(
#"(?i)[caFE]{4}"#,
("cafe", true),
("CaFe", true),
("EfAc", true))
}

func testMatchingOptionsScope() {
// `.` only matches newlines when the 's' option (single-line mode)
// is turned on. Standalone option-setting groups (e.g. `(?s)`) are
Expand Down