Skip to content

Commit 43ae710

Browse files
authored
Adds support for case insensitivity (#168)
1 parent f173eab commit 43ae710

File tree

4 files changed

+154
-30
lines changed

4 files changed

+154
-30
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,11 @@ extension Compiler.ByteCodeGen {
2020
emitAny()
2121

2222
case let .char(c):
23-
// FIXME: Does semantic level matter?
24-
builder.buildMatch(c)
25-
23+
try emitCharacter(c)
24+
2625
case let .scalar(s):
27-
// TODO: Native instruction
28-
builder.buildConsume(by: consumeScalar {
29-
$0 == s
30-
})
31-
26+
try emitScalar(s)
27+
3228
case let .assertion(kind):
3329
try emitAssertion(kind)
3430

@@ -135,6 +131,36 @@ extension Compiler.ByteCodeGen {
135131
}
136132
}
137133
}
134+
135+
/// Emits a consume instruction, built via `consumeScalar`, whose predicate
/// compares each candidate scalar against the pattern scalar `s`.
///
/// - When `options.isCaseInsensitive` is true, a candidate is accepted if its
///   `lowercaseMapping` equals the `lowercaseMapping` of `s`.
/// - Otherwise a candidate is accepted only if it is equal to `s`.
///
/// - Parameter s: The Unicode scalar from the pattern.
mutating func emitScalar(_ s: UnicodeScalar) throws {
  // TODO: Native instruction buildMatchScalar(s)
  if options.isCaseInsensitive {
    // TODO: e.g. buildCaseInsensitiveMatchScalar(s)
    // The pattern scalar never changes after compilation, so compute its
    // lowercase mapping once here instead of on every predicate call.
    let loweredPattern = s.properties.lowercaseMapping
    builder.buildConsume(by: consumeScalar {
      $0.properties.lowercaseMapping == loweredPattern
    })
  } else {
    builder.buildConsume(by: consumeScalar {
      $0 == s
    })
  }
}
148+
149+
/// Emits the instruction(s) that match the pattern character `c`.
///
/// - When `options.isCaseInsensitive` is true and `c.isCased`, emits a consume
///   closure that compares the lowercased input character at the start of the
///   bounds against the lowercased `c`, advancing by one character on success.
/// - Otherwise emits a plain `buildMatch(c)`; uncased characters have no case
///   variants to consider, so they take this path even under `(?i)`.
///
/// - Parameter c: The character from the pattern.
mutating func emitCharacter(_ c: Character) throws {
  // FIXME: Does semantic level matter?
  if options.isCaseInsensitive && c.isCased {
    // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
    // The pattern character is fixed, so lowercase it once at compile time
    // rather than on every match attempt.
    let matchChar = c.lowercased()
    builder.buildConsume { input, bounds in
      // An empty range has no character to read; report a failed match
      // instead of subscripting at (or past) the upper bound.
      guard !bounds.isEmpty else { return nil }
      let inputChar = input[bounds.lowerBound].lowercased()
      return inputChar == matchChar
        ? input.index(after: bounds.lowerBound)
        : nil
    }
  } else {
    builder.buildMatch(c)
  }
}
138164

139165
mutating func emitAny() {
140166
switch (options.semanticLevel, options.dotMatchesNewline) {
@@ -543,7 +569,22 @@ extension Compiler.ByteCodeGen {
543569

544570
case let .quotedLiteral(s):
545571
// TODO: Should this incorporate options?
546-
builder.buildMatchSequence(s)
572+
if options.isCaseInsensitive {
573+
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
574+
builder.buildConsume { input, bounds in
575+
var iterator = s.makeIterator()
576+
var currentIndex = bounds.lowerBound
577+
while let ch = iterator.next() {
578+
guard currentIndex < bounds.upperBound,
579+
ch.lowercased() == input[currentIndex].lowercased()
580+
else { return nil }
581+
input.formIndex(after: &currentIndex)
582+
}
583+
return currentIndex
584+
}
585+
} else {
586+
builder.buildMatchSequence(s)
587+
}
547588

548589
case let .regexLiteral(l):
549590
try emitNode(l.dslTreeNode)

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -51,24 +51,34 @@ extension DSLTree.Node {
5151
}
5252

5353
extension DSLTree.Atom {
54-
// TODO: If ByteCodeGen switches first, then this is
55-
// unnecessary...
54+
// TODO: If ByteCodeGen switches first, then this is unnecessary for
55+
// top-level nodes, but it's also invoked for `.atom` members of a custom CC
5656
func generateConsumer(
5757
_ opts: MatchingOptions
5858
) throws -> MEProgram<String>.ConsumeFunction? {
59+
let isCaseInsensitive = opts.isCaseInsensitive
60+
5961
switch self {
60-
6162
case let .char(c):
6263
// TODO: Match level?
6364
return { input, bounds in
6465
let low = bounds.lowerBound
65-
guard input[low] == c else {
66-
return nil
66+
if isCaseInsensitive && c.isCased {
67+
return input[low].lowercased() == c.lowercased()
68+
? input.index(after: low)
69+
: nil
70+
} else {
71+
return input[low] == c
72+
? input.index(after: low)
73+
: nil
6774
}
68-
return input.index(after: low)
6975
}
7076
case let .scalar(s):
71-
return consumeScalar { $0 == s }
77+
return consumeScalar {
78+
isCaseInsensitive
79+
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
80+
: $0 == s
81+
}
7282

7383
case .any:
7484
// FIXME: Should this be a total ordering?
@@ -187,14 +197,30 @@ extension DSLTree.CustomCharacterClass.Member {
187197
throw Unsupported("\(high) in range")
188198
}
189199

190-
return { input, bounds in
191-
// TODO: check for out of bounds?
192-
let curIdx = bounds.lowerBound
193-
if (lhs...rhs).contains(input[curIdx]) {
194-
// TODO: semantic level
195-
return input.index(after: curIdx)
200+
if opts.isCaseInsensitive {
201+
let lhsLower = lhs.lowercased()
202+
let rhsLower = rhs.lowercased()
203+
guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
204+
return { input, bounds in
205+
// TODO: check for out of bounds?
206+
let curIdx = bounds.lowerBound
207+
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
208+
// TODO: semantic level
209+
return input.index(after: curIdx)
210+
}
211+
return nil
212+
}
213+
} else {
214+
guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
215+
return { input, bounds in
216+
// TODO: check for out of bounds?
217+
let curIdx = bounds.lowerBound
218+
if (lhs...rhs).contains(input[curIdx]) {
219+
// TODO: semantic level
220+
return input.index(after: curIdx)
221+
}
222+
return nil
196223
}
197-
return nil
198224
}
199225

200226
case let .custom(ccc):
@@ -237,11 +263,20 @@ extension DSLTree.CustomCharacterClass.Member {
237263
return rhs(input, bounds)
238264
}
239265
case .quotedLiteral(let s):
240-
return { input, bounds in
241-
guard s.contains(input[bounds.lowerBound]) else {
242-
return nil
266+
if opts.isCaseInsensitive {
267+
return { input, bounds in
268+
guard s.lowercased().contains(input[bounds.lowerBound].lowercased()) else {
269+
return nil
270+
}
271+
return input.index(after: bounds.lowerBound)
272+
}
273+
} else {
274+
return { input, bounds in
275+
guard s.contains(input[bounds.lowerBound]) else {
276+
return nil
277+
}
278+
return input.index(after: bounds.lowerBound)
243279
}
244-
return input.index(after: bounds.lowerBound)
245280
}
246281
case .trivia:
247282
// TODO: Should probably strip this earlier...

Sources/_StringProcessing/MatchingOptions.swift

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ struct MatchingOptions {
2525
}
2626
}
2727

28-
// Compiler API
28+
// MARK: Compilation API
2929
extension MatchingOptions {
3030
/// Creates an instance with the default options.
3131
init() {
@@ -51,7 +51,14 @@ extension MatchingOptions {
5151
stack[stack.count - 1].apply(sequence)
5252
_invariantCheck()
5353
}
54+
}
5455

56+
// MARK: Matching behavior API
57+
extension MatchingOptions {
58+
/// Whether the last entry of `stack` contains the `.caseInsensitive` option.
///
/// Force-unwraps `stack.last`, so this traps if `stack` is empty.
var isCaseInsensitive: Bool {
  return stack.last!.contains(.caseInsensitive)
}
61+
5562
var isReluctantByDefault: Bool {
5663
stack.last!.contains(.reluctantByDefault)
5764
}

Tests/RegexTests/MatchTests.swift

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,9 @@ func matchTest(
115115
syntax: SyntaxOptions = .traditional,
116116
enableTracing: Bool = false,
117117
dumpAST: Bool = false,
118-
xfail: Bool = false
118+
xfail: Bool = false,
119+
file: StaticString = #file,
120+
line: UInt = #line
119121
) {
120122
for (test, expect) in tests {
121123
firstMatchTest(
@@ -125,7 +127,9 @@ func matchTest(
125127
syntax: syntax,
126128
enableTracing: enableTracing,
127129
dumpAST: dumpAST,
128-
xfail: xfail)
130+
xfail: xfail,
131+
file: file,
132+
line: line)
129133
}
130134
}
131135

@@ -1126,6 +1130,43 @@ extension RegexTests {
11261130
firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb")
11271131
}
11281132

1133+
/// Checks matching with and without the `(?i)` option for plain literals,
/// `.` wildcards, non-ASCII characters, scalar escapes, and custom
/// character classes.
func testCaseSensitivity() {
  // Without `(?i)`: a differently-cased first letter must not match.
  matchTest(#"c..e"#, ("cafe", true), ("Cafe", false))

  // With `(?i)`: literal characters mixed with wildcards.
  matchTest(#"(?i)c.f."#, ("cafe", true), ("Cafe", true), ("caFe", true))

  // With `(?i)`: an all-literal ASCII pattern.
  matchTest(#"(?i)cafe"#, ("cafe", true), ("Cafe", true), ("caFe", true))

  // With `(?i)`: a non-ASCII cased character in the pattern.
  matchTest(#"(?i)café"#, ("café", true), ("CafÉ", true))

  // With `(?i)`: the same word with two characters written as scalar escapes.
  matchTest(#"(?i)\u{63}af\u{e9}"#, ("café", true), ("CafÉ", true))

  // Custom character class without `(?i)`: only the exact listed cases match.
  matchTest(
    #"[caFE]{4}"#,
    ("cafe", false),
    ("CAFE", false),
    ("caFE", true),
    ("EFac", true))

  // Custom character class with `(?i)`: any casing of the members matches.
  matchTest(
    #"(?i)[caFE]{4}"#,
    ("cafe", true),
    ("CaFe", true),
    ("EfAc", true))
}
1169+
11291170
func testMatchingOptionsScope() {
11301171
// `.` only matches newlines when the 's' option (single-line mode)
11311172
// is turned on. Standalone option-setting groups (e.g. `(?s)`) are

0 commit comments

Comments
 (0)