Skip to content

Commit ebd1297

Browse files
committed
Fix canonical equivalence at different levels
This re-interprets runs of characters and scalars as a quoted literal, and implements the correct semantic level matching for scalars, characters, and quoted literals.
1 parent 9a2d623 commit ebd1297

File tree

4 files changed

+68
-29
lines changed

4 files changed

+68
-29
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,15 @@ extension Compiler.ByteCodeGen {
168168
}
169169

170170
mutating func emitCharacter(_ c: Character) throws {
171-
// FIXME: Does semantic level matter?
171+
// Unicode scalar matches the specific scalars that comprise a character
172+
if options.semanticLevel == .unicodeScalar {
173+
print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars")
174+
for scalar in c.unicodeScalars {
175+
try emitScalar(scalar)
176+
}
177+
return
178+
}
179+
172180
if options.isCaseInsensitive && c.isCased {
173181
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
174182
builder.buildConsume { input, bounds in
@@ -627,22 +635,44 @@ extension Compiler.ByteCodeGen {
627635
try emitAtom(a)
628636

629637
case let .quotedLiteral(s):
630-
// TODO: Should this incorporate options?
631-
if options.isCaseInsensitive {
632-
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
633-
builder.buildConsume { input, bounds in
634-
var iterator = s.makeIterator()
638+
if options.semanticLevel == .graphemeCluster {
639+
if options.isCaseInsensitive {
640+
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
641+
builder.buildConsume { input, bounds in
642+
var iterator = s.makeIterator()
643+
var currentIndex = bounds.lowerBound
644+
while let ch = iterator.next() {
645+
guard currentIndex < bounds.upperBound,
646+
ch.lowercased() == input[currentIndex].lowercased()
647+
else { return nil }
648+
input.formIndex(after: &currentIndex)
649+
}
650+
return currentIndex
651+
}
652+
} else {
653+
builder.buildMatchSequence(s)
654+
}
655+
} else {
656+
builder.buildConsume {
657+
[caseInsensitive = options.isCaseInsensitive] input, bounds in
658+
// TODO: Case folding
659+
var iterator = s.unicodeScalars.makeIterator()
635660
var currentIndex = bounds.lowerBound
636-
while let ch = iterator.next() {
637-
guard currentIndex < bounds.upperBound,
638-
ch.lowercased() == input[currentIndex].lowercased()
639-
else { return nil }
640-
input.formIndex(after: &currentIndex)
661+
while let scalar = iterator.next() {
662+
guard currentIndex < bounds.upperBound else { return nil }
663+
if caseInsensitive {
664+
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
665+
return nil
666+
}
667+
} else {
668+
if scalar != input.unicodeScalars[currentIndex] {
669+
return nil
670+
}
671+
}
672+
input.unicodeScalars.formIndex(after: &currentIndex)
641673
}
642674
return currentIndex
643675
}
644-
} else {
645-
builder.buildMatchSequence(s)
646676
}
647677

648678
case let .regexLiteral(l):

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,13 @@ extension AST.Atom {
131131
}
132132
}
133133

134+
var singleScalar: UnicodeScalar? {
135+
switch kind {
136+
case .scalar(let s): return s
137+
default: return nil
138+
}
139+
}
140+
134141
func generateConsumer(
135142
_ opts: MatchingOptions
136143
) throws -> MEProgram<String>.ConsumeFunction? {

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,17 @@ extension AST.Node {
6565
// TODO: For printing, nice to coalesce
6666
// scalars literals too. We likely need a different
6767
// approach even before we have a better IR.
68-
guard let char = atom?.singleCharacter else {
68+
if let char = atom?.singleCharacter {
69+
result.append(char)
70+
} else if let scalar = atom?.singleScalar {
71+
result.append(Character(scalar))
72+
} else {
6973
break
7074
}
71-
result.append(char)
75+
7276
astChildren.formIndex(after: &idx)
7377
}
74-
return result.count <= 1 ? nil : (idx, result)
78+
return result.isEmpty ? nil : (idx, result)
7579
}
7680

7781
// No need to nest single children concatenations
@@ -207,7 +211,7 @@ extension AST.Atom {
207211

208212
switch self.kind {
209213
case let .char(c): return .char(c)
210-
case let .scalar(s): return .scalar(s)
214+
case let .scalar(s): return .char(Character(s))
211215
case .any: return .any
212216
case let .backreference(r): return .backreference(r)
213217
case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq)

Tests/RegexTests/MatchTests.swift

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -938,15 +938,15 @@ extension RegexTests {
938938

939939
// TODO: Oniguruma \y and \Y
940940
firstMatchTests(
941-
#"\u{65}"#, // Scalar 'e' is present in both:
942-
("Cafe\u{301}", "e"), // composed and
943-
("Sol Cafe", "e")) // standalone
941+
#"\u{65}"#, // Scalar 'e' is present in both
942+
("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match
943+
("Sol Cafe", "e")) // standalone is okay
944944
firstMatchTests(
945945
#"\u{65}\y"#, // Grapheme boundary assertion
946946
("Cafe\u{301}", nil),
947947
("Sol Cafe", "e"))
948948
firstMatchTests(
949-
#"\u{65}\Y"#, // Grapheme non-boundary assertion
949+
#"(?u)\u{65}\Y"#, // Grapheme non-boundary assertion
950950
("Cafe\u{301}", "e"),
951951
("Sol Cafe", nil))
952952
}
@@ -1353,11 +1353,10 @@ extension RegexTests {
13531353
// as a character.
13541354

13551355
firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed)
1356-
// FIXME: Decomposed character in regex literal doesn't match an equivalent character
1357-
firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed,
1358-
xfail: true)
1356+
firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed)
13591357

1360-
firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e")
1358+
firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e",
1359+
xfail: true)
13611360
firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil)
13621361
// FIXME: \y is unsupported
13631362
firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil,
@@ -1381,12 +1380,10 @@ extension RegexTests {
13811380
(eComposed, true),
13821381
(eDecomposed, true))
13831382

1384-
// FIXME: Decomposed character in regex literal doesn't match an equivalent character
13851383
matchTest(
13861384
#"e\u{301}$"#,
13871385
(eComposed, true),
1388-
(eDecomposed, true),
1389-
xfail: true)
1386+
(eDecomposed, true))
13901387

13911388
matchTest(
13921389
#"e$"#,
@@ -1472,7 +1469,8 @@ extension RegexTests {
14721469
firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag)
14731470

14741471
// First Unicode scalar followed by CCC of regional indicators
1475-
firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag)
1472+
firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag,
1473+
xfail: true)
14761474

14771475
// FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
14781476
// A CCC of regional indicators x 2

0 commit comments

Comments
 (0)