Skip to content

Commit d5315d2

Browse files
committed
Coalesce character class members
In grapheme semantic mode, coalesce adjacent character and scalar members of a custom character class, over which we can perform grapheme breaking. This involves potentially re-writing ranges such that they contain a complete grapheme of adjacent scalars.
1 parent 0749de6 commit d5315d2

File tree

4 files changed

+345
-4
lines changed

4 files changed

+345
-4
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,9 +751,119 @@ fileprivate extension Compiler.ByteCodeGen {
751751
builder.label(exit)
752752
}
753753

754+
/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested character classes and the operands of set operations are coalesced
/// recursively first. After that, runs of adjacent literal atoms, quoted
/// literals and literal ranges at this level are accumulated into strings and
/// re-emitted as quoted literals and ranges.
///
/// - Parameter members: The members of a single custom character class.
/// - Returns: The rewritten members. Members that cannot be accumulated are
///   left to `coalescing` to carry over into the result.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// members `"a"`, `b-c`, `"d"`, `e-f`, `"g"` will be created.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last element of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound ends the current operand; the upper bound starts a
        // new one, so later members may still be appended to it.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      case .custom, .intersection, .subtraction, .symmetricDifference,
           .trivia:
        // Nested classes, set operations and trivia are never accumulated.
        return false
      }
    }

    /// Turn the accumulated operands back into character class members.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      if rangeOperands.count == 1 {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members = [DSLTree.CustomCharacterClass.Member]()

      // We have other range operands, splice them together. For N operands
      // there are N - 1 ranges.
      for (i, lhs) in rangeOperands.dropLast().enumerated() {
        let rhs = rangeOperands[i + 1]

        // The first operand is only the lower side of a range, so only its
        // last character is a bound. Every later operand visited here is the
        // upper side of the previous range *and* the lower side of this one,
        // so both of its ends are bounds and must be trimmed. Trimming only
        // one end would emit the interior characters twice.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }

        // `tryAccumulate` appends a character to the operand on either side
        // of every range it records, so neither operand can be empty here.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }

      // Everything has been handled except the characters following the
      // upper bound in the final operand; emit them exactly once.
      let trailing = current.dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }
  return members
    .map { m -> DSLTree.CustomCharacterClass.Member in
      // First we need to recursively coalesce any child character classes.
      switch m {
      case .custom(let ccc):
        return .custom(coalescingCustomCharacterClass(ccc))
      case .intersection(let lhs, let rhs):
        return .intersection(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .subtraction(let lhs, let rhs):
        return .subtraction(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .symmetricDifference(let lhs, let rhs):
        return .symmetricDifference(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .atom, .range, .quotedLiteral, .trivia:
        return m
      }
    }
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}
845+
846+
/// Returns `ccc` with its members coalesced for grapheme-cluster matching.
///
/// - Parameter ccc: The custom character class to rewrite.
/// - Returns: `ccc` itself when the semantic level is not `.graphemeCluster`;
///   otherwise a class holding the coalesced members with the same
///   `isInverted` value.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing only applies in grapheme semantic mode. Under scalar
  // semantics scalars must not be merged into a grapheme, so for example
  // `[e\u{301}-\u{302}]` has to remain a range between U+301 and U+302.
  if options.semanticLevel != .graphemeCluster {
    return ccc
  }

  return DSLTree.CustomCharacterClass(
    members: coalescingCustomCharacterClassMembers(ccc.members),
    isInverted: ccc.isInverted)
}
858+
754859
mutating func emitCustomCharacterClass(
755860
_ ccc: DSLTree.CustomCharacterClass
756861
) throws {
862+
// Before emitting a custom character class in grapheme semantic mode, we
863+
// need to coalesce together any adjacent characters and scalars, over which
864+
// we can perform grapheme breaking. This includes e.g. range bounds for
865+
// `[e\u{301}-\u{302}]`.
866+
let ccc = coalescingCustomCharacterClass(ccc)
757867
if let asciiBitset = ccc.asAsciiBitset(options),
758868
optimizationsEnabled {
759869
if options.semanticLevel == .unicodeScalar {

Sources/_StringProcessing/Utility/Misc.swift

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
extension Array {
1313
/// Coalesce adjacent elements using a given accumulator. The accumulator is
14-
/// transformed into an element of the array by `finish`. The `accumulate`
14+
/// transformed into elements of the array by `finish`. The `accumulate`
1515
/// function should return `true` if the accumulator has coalesced the
1616
/// element, `false` otherwise.
1717
func coalescing<T>(
18-
with initialAccumulator: T, into finish: (T) -> Element,
18+
with initialAccumulator: T, into finish: (T) -> Self,
1919
accumulate: (inout T, Element) -> Bool
2020
) -> Self {
2121
var didAccumulate = false
@@ -32,16 +32,28 @@ extension Array {
3232
if didAccumulate {
3333
// We have a leftover accumulator, which needs to be finished before we
3434
// can append the next element.
35-
result.append(finish(accumulator))
35+
result += finish(accumulator)
3636
accumulator = initialAccumulator
3737
didAccumulate = false
3838
}
3939
result.append(elt)
4040
}
4141
// Handle a leftover accumulation.
4242
if didAccumulate {
43-
result.append(finish(accumulator))
43+
result += finish(accumulator)
4444
}
4545
return result
4646
}
47+
48+
/// Coalesce adjacent elements using a given accumulator, where `finish`
/// produces exactly one element per finished accumulator.
///
/// This is a convenience over the overload whose `finish` returns an array:
/// the single element is wrapped in a one-element array and forwarded.
///
/// - Parameters:
///   - initialAccumulator: The accumulator value to start from.
///   - finish: Transforms an accumulator into one element of the array.
///   - accumulate: Should return `true` if the accumulator has coalesced the
///     element, `false` otherwise.
/// - Returns: The coalesced array.
func coalescing<T>(
  with initialAccumulator: T, into finish: (T) -> Element,
  accumulate: (inout T, Element) -> Bool
) -> Self {
  coalescing(
    with: initialAccumulator,
    into: { finished -> Self in [finish(finished)] },
    accumulate: accumulate)
}
4759
}

Tests/RegexTests/MatchTests.swift

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,223 @@ extension RegexTests {
731731
semanticLevel: .unicodeScalar
732732
)
733733

734+
// Scalar coalescing.
735+
firstMatchTests(
736+
#"[e\u{301}]"#,
737+
(eDecomposed, eDecomposed),
738+
(eComposed, eComposed),
739+
("e", nil),
740+
("\u{301}", nil)
741+
)
742+
firstMatchTests(
743+
#"[e\u{301}]"#,
744+
(eDecomposed, "e"),
745+
(eComposed, nil),
746+
("e", "e"),
747+
("\u{301}", "\u{301}"),
748+
semanticLevel: .unicodeScalar
749+
)
750+
firstMatchTests(
751+
#"[[[e\u{301}]]]"#,
752+
(eDecomposed, eDecomposed),
753+
(eComposed, eComposed),
754+
("e", nil),
755+
("\u{301}", nil)
756+
)
757+
firstMatchTests(
758+
#"[[[e\u{301}]]]"#,
759+
(eDecomposed, "e"),
760+
(eComposed, nil),
761+
("e", "e"),
762+
("\u{301}", "\u{301}"),
763+
semanticLevel: .unicodeScalar
764+
)
765+
firstMatchTests(
766+
#"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#,
767+
("👨", nil),
768+
("👩", nil),
769+
("👧", nil),
770+
("👦", nil),
771+
("\u{200D}", nil),
772+
("👨‍👩‍👧‍👦", "👨‍👩‍👧‍👦")
773+
)
774+
firstMatchTests(
775+
#"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#,
776+
("👨", "👨"),
777+
("👩", "👩"),
778+
("👧", "👧"),
779+
("👦", "👦"),
780+
("\u{200D}", "\u{200D}"),
781+
("👨‍👩‍👧‍👦", "👨"),
782+
semanticLevel: .unicodeScalar
783+
)
784+
firstMatchTests(
785+
#"[e\u{315}\u{301}\u{35C}]"#,
786+
("e", nil),
787+
("e\u{315}", nil),
788+
("e\u{301}", nil),
789+
("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"),
790+
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
791+
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
792+
)
793+
794+
firstMatchTests(
795+
#"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#,
796+
("a", "a"),
797+
("a\u{301}", "a\u{301}"),
798+
("\u{E1}", "\u{E1}"),
799+
("\u{E2}", nil),
800+
("z", "z"),
801+
("e", "e"),
802+
(eDecomposed, eDecomposed),
803+
(eComposed, eComposed),
804+
("\u{302}", "\u{302}"),
805+
("1", "1"),
806+
("2", nil),
807+
("3", "3"),
808+
("4", "4"),
809+
("5", "5"),
810+
("6", nil),
811+
("7", nil),
812+
("8", nil),
813+
("9", "9")
814+
)
815+
816+
// These can't compile in grapheme semantic mode, but make sure they work in
817+
// scalar semantic mode.
818+
firstMatchTests(
819+
#"[a\u{315}\u{301}-\u{302}]"#,
820+
("a", "a"),
821+
("\u{315}", "\u{315}"),
822+
("\u{301}", "\u{301}"),
823+
("\u{302}", "\u{302}"),
824+
("\u{303}", nil),
825+
semanticLevel: .unicodeScalar
826+
)
827+
firstMatchTests(
828+
#"[\u{73}\u{323}\u{307}-\u{1E00}]"#,
829+
("\u{73}", "\u{73}"),
830+
("\u{323}", "\u{323}"),
831+
("\u{307}", "\u{307}"),
832+
("\u{400}", "\u{400}"),
833+
("\u{500}", "\u{500}"),
834+
("\u{1E00}", "\u{1E00}"),
835+
("\u{1E01}", nil),
836+
("\u{1E69}", nil),
837+
semanticLevel: .unicodeScalar
838+
)
839+
firstMatchTests(
840+
#"[a\u{302}-✅]"#,
841+
("a", "a"),
842+
("\u{302}", "\u{302}"),
843+
("A\u{302}", "\u{302}"),
844+
("E\u{301}", nil),
845+
("a\u{301}", "a"),
846+
("\u{E1}", nil),
847+
("a\u{302}", "a"),
848+
("\u{E2}", nil),
849+
("\u{E3}", nil),
850+
("\u{EF}", nil),
851+
("e\u{301}", nil),
852+
("e\u{302}", "\u{302}"),
853+
("\u{2705}", "\u{2705}"),
854+
("", ""),
855+
("\u{376}", "\u{376}"),
856+
("\u{850}", "\u{850}"),
857+
("a\u{302}\u{315}", "a"),
858+
semanticLevel: .unicodeScalar
859+
)
860+
firstMatchTests(
861+
#"(?i)[a\u{302}-✅]"#,
862+
("a", "a"),
863+
("\u{302}", "\u{302}"),
864+
("A\u{302}", "A"),
865+
("E\u{301}", nil),
866+
("a\u{301}", "a"),
867+
("\u{E1}", nil),
868+
("a\u{302}", "a"),
869+
("\u{E2}", nil),
870+
("\u{E3}", nil),
871+
("\u{EF}", nil),
872+
("e\u{301}", nil),
873+
("e\u{302}", "\u{302}"),
874+
("\u{2705}", "\u{2705}"),
875+
("", ""),
876+
("\u{376}", "\u{376}"),
877+
("\u{850}", "\u{850}"),
878+
("a\u{302}\u{315}", "a"),
879+
semanticLevel: .unicodeScalar
880+
)
881+
firstMatchTests(
882+
#"[e\u{301}-\u{302}]"#,
883+
("a", nil),
884+
("e", "e"),
885+
("\u{302}", "\u{302}"),
886+
("A\u{302}", "\u{302}"),
887+
("E\u{301}", "\u{301}"),
888+
("\u{C8}", nil),
889+
("\u{C9}", nil),
890+
("\u{CA}", nil),
891+
("\u{CB}", nil),
892+
("a\u{301}", "\u{301}"),
893+
("a\u{302}", "\u{302}"),
894+
("e\u{301}", "e"),
895+
("e\u{302}", "e"),
896+
("\u{E1}", nil),
897+
("\u{E2}", nil),
898+
("\u{E9}", nil),
899+
("\u{EA}", nil),
900+
("\u{EF}", nil),
901+
semanticLevel: .unicodeScalar
902+
)
903+
firstMatchTests(
904+
#"(?i)[e\u{301}-\u{302}]"#,
905+
("a", nil),
906+
("e", "e"),
907+
("\u{302}", "\u{302}"),
908+
("A\u{302}", "\u{302}"),
909+
("E\u{301}", "E"),
910+
("\u{C8}", nil),
911+
("\u{C9}", nil),
912+
("\u{CA}", nil),
913+
("\u{CB}", nil),
914+
("a\u{301}", "\u{301}"),
915+
("a\u{302}", "\u{302}"),
916+
("e\u{301}", "e"),
917+
("e\u{302}", "e"),
918+
("\u{E1}", nil),
919+
("\u{E2}", nil),
920+
("\u{E9}", nil),
921+
("\u{EA}", nil),
922+
("\u{EF}", nil),
923+
semanticLevel: .unicodeScalar
924+
)
925+
926+
// Set operation scalar coalescing.
927+
firstMatchTests(
928+
#"[e\u{301}&&e\u{301}e\u{302}]"#,
929+
("e", nil),
930+
("\u{301}", nil),
931+
("\u{302}", nil),
932+
("e\u{301}", "e\u{301}"),
933+
("e\u{302}", nil))
934+
firstMatchTests(
935+
#"[e\u{301}~~[[e\u{301}]e\u{302}]]"#,
936+
("e", nil),
937+
("\u{301}", nil),
938+
("\u{302}", nil),
939+
("e\u{301}", nil),
940+
("e\u{302}", "e\u{302}"))
941+
firstMatchTests(
942+
#"[e\u{301}[e\u{303}]--[[e\u{301}]e\u{302}]]"#,
943+
("e", nil),
944+
("\u{301}", nil),
945+
("\u{302}", nil),
946+
("\u{303}", nil),
947+
("e\u{301}", nil),
948+
("e\u{302}", nil),
949+
("e\u{303}", "e\u{303}"))
950+
734951
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
735952

736953
// These are metacharacters in certain contexts, but normal characters

Tests/RegexTests/ParseTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2929,6 +2929,8 @@ extension RegexTests {
29292929
diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b"))
29302930
diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}"))
29312931

2932+
diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e"))
2933+
29322934
diagnosticTest("(?x)[(?#)]", .expected("]"))
29332935
diagnosticTest("(?x)[(?#abc)]", .expected("]"))
29342936

0 commit comments

Comments
 (0)