Skip to content

Commit 5065b4e

Browse files
committed
Coalesce character class members
In grapheme semantic mode, coalesce adjacent character and scalar members of a custom character class so that grapheme breaking can be performed over them. This may involve rewriting ranges so that each range bound contains a complete grapheme formed from adjacent scalars.
1 parent 203868f commit 5065b4e

File tree

4 files changed

+267
-4
lines changed

4 files changed

+267
-4
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,9 +775,90 @@ fileprivate extension Compiler.ByteCodeGen {
775775
builder.label(exit)
776776
}
777777

778+
/// Returns a copy of `ccc` in which adjacent literal members (atoms with a
/// literal character value, quoted literals, and ranges whose bounds both have
/// literal character values) have been merged through `String`, so that
/// neighbouring scalars are combined into whole characters.
///
/// - Parameter ccc: The custom character class to coalesce.
/// - Returns: `ccc` unchanged when the semantic level is not
///   `.graphemeCluster`; otherwise a class with the coalesced members and the
///   same `isInverted` value.
func coalescingCustomCharacterClassMembers(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // This only needs to be done in grapheme semantic mode. In scalar semantic
  // mode, we don't want to coalesce any scalars into a grapheme. This
  // means that e.g. `[e\u{301}-\u{302}]` remains a range between U+301 and
  // U+302.
  guard options.semanticLevel == .graphemeCluster else { return ccc }

  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last entry of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound ends the current operand; the upper bound starts a
        // new one, so later members can still attach to it.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      default:
        return false
      }
    }

    /// Turn the accumulated operands back into character class members.
    ///
    /// With a single operand the result is one quoted literal. With N
    /// operands the result is N - 1 ranges, each running from the last
    /// character of one operand to the first character of the next, plus
    /// quoted literals for the characters that are not range bounds.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      if rangeOperands.count == 1 {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members = [DSLTree.CustomCharacterClass.Member]()

      // We have other range operands, splice them together.
      for (i, lhs) in rangeOperands.dropLast().enumerated() {
        let rhs = rangeOperands[i + 1]

        // The first operand is only ever the LHS of a range, so only its last
        // character is a bound. Every later operand in this loop is both the
        // RHS of the previous range and the LHS of this one, so both ends are
        // bounds and must be trimmed. Trimming only one end here (and the
        // other end on the previous iteration) would emit the interior
        // characters twice and re-emit the bounds as literals.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }

        // `tryAccumulate` appends a character to the current operand before
        // starting a new one, and seeds each new operand with the upper
        // bound, so neither operand is empty here.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }

      // Everything except the non-bound characters of the final operand has
      // been handled; add them now.
      if let last = rangeOperands.last {
        let trailing = last.dropFirst()
        if !trailing.isEmpty {
          members.append(.quotedLiteral(String(trailing)))
        }
      }
      return members
    }
  }

  let members = ccc.members
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
  return .init(members: members, isInverted: ccc.isInverted)
}
853+
778854
mutating func emitCustomCharacterClass(
779855
_ ccc: DSLTree.CustomCharacterClass
780856
) throws {
857+
// Before emitting a custom character class in grapheme semantic mode, we
858+
// need to coalesce together any adjacent characters and scalars, over which
859+
// we can perform grapheme breaking. This includes e.g. range bounds for
860+
// `[e\u{301}-\u{302}]`.
861+
let ccc = coalescingCustomCharacterClassMembers(ccc)
781862
if let asciiBitset = ccc.asAsciiBitset(options),
782863
optimizationsEnabled {
783864
if options.semanticLevel == .unicodeScalar {

Sources/_StringProcessing/Utility/Misc.swift

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
extension Array {
1313
/// Coalesce adjacent elements using a given accumulator. The accumulator is
14-
/// transformed into an element of the array by `finish`. The `accumulate`
14+
/// transformed into elements of the array by `finish`. The `accumulate`
1515
/// function should return `true` if the accumulator has coalesced the
1616
/// element, `false` otherwise.
1717
func coalescing<T>(
18-
with initialAccumulator: T, into finish: (T) -> Element,
18+
with initialAccumulator: T, into finish: (T) -> Self,
1919
accumulate: (inout T, Element) -> Bool
2020
) -> Self {
2121
var didAccumulate = false
@@ -32,16 +32,28 @@ extension Array {
3232
if didAccumulate {
3333
// We have a leftover accumulator, which needs to be finished before we
3434
// can append the next element.
35-
result.append(finish(accumulator))
35+
result += finish(accumulator)
3636
accumulator = initialAccumulator
3737
didAccumulate = false
3838
}
3939
result.append(elt)
4040
}
4141
// Handle a leftover accumulation.
4242
if didAccumulate {
43-
result.append(finish(accumulator))
43+
result += finish(accumulator)
4444
}
4545
return result
4646
}
47+
48+
/// Coalesces adjacent elements through an accumulator, for the case where a
/// finished accumulator becomes exactly one element of the array. The
/// `accumulate` function should return `true` if the accumulator has
/// coalesced the element it was given, `false` otherwise.
func coalescing<T>(
  with initialAccumulator: T, into finish: (T) -> Element,
  accumulate: (inout T, Element) -> Bool
) -> Self {
  // Forward to the overload whose `finish` yields an array, wrapping the
  // single finished element in a one-element array.
  return coalescing(
    with: initialAccumulator,
    into: { accumulator -> [Element] in [finish(accumulator)] },
    accumulate: accumulate
  )
}
4759
}

Tests/RegexTests/MatchTests.swift

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,174 @@ extension RegexTests {
765765
semanticLevel: .unicodeScalar
766766
)
767767

768+
// Scalar coalescing.
769+
firstMatchTests(
770+
#"[e\u{301}]"#,
771+
(eDecomposed, eDecomposed),
772+
(eComposed, eComposed),
773+
("e", nil),
774+
("\u{301}", nil)
775+
)
776+
firstMatchTests(
777+
#"[e\u{301}]"#,
778+
(eDecomposed, "e"),
779+
(eComposed, nil),
780+
("e", "e"),
781+
("\u{301}", "\u{301}"),
782+
semanticLevel: .unicodeScalar
783+
)
784+
firstMatchTests(
785+
#"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#,
786+
("👨", nil),
787+
("👩", nil),
788+
("👧", nil),
789+
("👦", nil),
790+
("\u{200D}", nil),
791+
("👨‍👩‍👧‍👦", "👨‍👩‍👧‍👦")
792+
)
793+
firstMatchTests(
794+
#"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#,
795+
("👨", "👨"),
796+
("👩", "👩"),
797+
("👧", "👧"),
798+
("👦", "👦"),
799+
("\u{200D}", "\u{200D}"),
800+
("👨‍👩‍👧‍👦", "👨"),
801+
semanticLevel: .unicodeScalar
802+
)
803+
804+
firstMatchTests(
805+
#"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#,
806+
("a", "a"),
807+
("a\u{301}", "a\u{301}"),
808+
("\u{E1}", "\u{E1}"),
809+
("\u{E2}", nil),
810+
("z", "z"),
811+
("e", "e"),
812+
(eDecomposed, eDecomposed),
813+
(eComposed, eComposed),
814+
("\u{302}", "\u{302}"),
815+
("1", "1"),
816+
("2", nil),
817+
("3", "3"),
818+
("4", "4"),
819+
("5", "5"),
820+
("6", nil),
821+
("7", nil),
822+
("8", nil),
823+
("9", "9")
824+
)
825+
826+
// These can't compile in grapheme semantic mode, but make sure they work in
827+
// scalar semantic mode.
828+
firstMatchTests(
829+
#"[a\u{315}\u{301}-\u{302}]"#,
830+
("a", "a"),
831+
("\u{315}", "\u{315}"),
832+
("\u{301}", "\u{301}"),
833+
("\u{302}", "\u{302}"),
834+
("\u{303}", nil),
835+
semanticLevel: .unicodeScalar
836+
)
837+
firstMatchTests(
838+
#"[\u{73}\u{323}\u{307}-\u{1E00}]"#,
839+
("\u{73}", "\u{73}"),
840+
("\u{323}", "\u{323}"),
841+
("\u{307}", "\u{307}"),
842+
("\u{400}", "\u{400}"),
843+
("\u{500}", "\u{500}"),
844+
("\u{1E00}", "\u{1E00}"),
845+
("\u{1E01}", nil),
846+
("\u{1E69}", nil),
847+
semanticLevel: .unicodeScalar
848+
)
849+
firstMatchTests(
850+
#"[a\u{302}-✅]"#,
851+
("a", "a"),
852+
("\u{302}", "\u{302}"),
853+
("A\u{302}", "\u{302}"),
854+
("E\u{301}", nil),
855+
("a\u{301}", "a"),
856+
("\u{E1}", nil),
857+
("a\u{302}", "a"),
858+
("\u{E2}", nil),
859+
("\u{E3}", nil),
860+
("\u{EF}", nil),
861+
("e\u{301}", nil),
862+
("e\u{302}", "\u{302}"),
863+
("\u{2705}", "\u{2705}"),
864+
("", ""),
865+
("\u{376}", "\u{376}"),
866+
("\u{850}", "\u{850}"),
867+
("a\u{302}\u{315}", "a"),
868+
semanticLevel: .unicodeScalar
869+
)
870+
firstMatchTests(
871+
#"(?i)[a\u{302}-✅]"#,
872+
("a", "a"),
873+
("\u{302}", "\u{302}"),
874+
("A\u{302}", "A"),
875+
("E\u{301}", nil),
876+
("a\u{301}", "a"),
877+
("\u{E1}", nil),
878+
("a\u{302}", "a"),
879+
("\u{E2}", nil),
880+
("\u{E3}", nil),
881+
("\u{EF}", nil),
882+
("e\u{301}", nil),
883+
("e\u{302}", "\u{302}"),
884+
("\u{2705}", "\u{2705}"),
885+
("", ""),
886+
("\u{376}", "\u{376}"),
887+
("\u{850}", "\u{850}"),
888+
("a\u{302}\u{315}", "a"),
889+
semanticLevel: .unicodeScalar
890+
)
891+
firstMatchTests(
892+
#"[e\u{301}-\u{302}]"#,
893+
("a", nil),
894+
("e", "e"),
895+
("\u{302}", "\u{302}"),
896+
("A\u{302}", "\u{302}"),
897+
("E\u{301}", "\u{301}"),
898+
("\u{C8}", nil),
899+
("\u{C9}", nil),
900+
("\u{CA}", nil),
901+
("\u{CB}", nil),
902+
("a\u{301}", "\u{301}"),
903+
("a\u{302}", "\u{302}"),
904+
("e\u{301}", "e"),
905+
("e\u{302}", "e"),
906+
("\u{E1}", nil),
907+
("\u{E2}", nil),
908+
("\u{E9}", nil),
909+
("\u{EA}", nil),
910+
("\u{EF}", nil),
911+
semanticLevel: .unicodeScalar
912+
)
913+
firstMatchTests(
914+
#"(?i)[e\u{301}-\u{302}]"#,
915+
("a", nil),
916+
("e", "e"),
917+
("\u{302}", "\u{302}"),
918+
("A\u{302}", "\u{302}"),
919+
("E\u{301}", "E"),
920+
("\u{C8}", nil),
921+
("\u{C9}", nil),
922+
("\u{CA}", nil),
923+
("\u{CB}", nil),
924+
("a\u{301}", "\u{301}"),
925+
("a\u{302}", "\u{302}"),
926+
("e\u{301}", "e"),
927+
("e\u{302}", "e"),
928+
("\u{E1}", nil),
929+
("\u{E2}", nil),
930+
("\u{E9}", nil),
931+
("\u{EA}", nil),
932+
("\u{EF}", nil),
933+
semanticLevel: .unicodeScalar
934+
)
935+
768936
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
769937

770938
// These are metacharacters in certain contexts, but normal characters

Tests/RegexTests/ParseTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2938,6 +2938,8 @@ extension RegexTests {
29382938
diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b"))
29392939
diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}"))
29402940

2941+
diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e"))
2942+
29412943
diagnosticTest("(?x)[(?#)]", .expected("]"))
29422944
diagnosticTest("(?x)[(?#abc)]", .expected("]"))
29432945

0 commit comments

Comments
 (0)