Skip to content

Commit c7168f5

Browse files
committed
Coalesce character class members
In grapheme semantic mode, coalesce adjacent character and scalar members of a custom character class, over which we can perform grapheme breaking. This involves potentially re-writing ranges such that they contain a complete grapheme of adjacent scalars.
1 parent 2095b85 commit c7168f5

File tree

4 files changed

+383
-4
lines changed

4 files changed

+383
-4
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,9 +751,127 @@ fileprivate extension Compiler.ByteCodeGen {
751751
builder.label(exit)
752752
}
753753

754+
/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested character classes, including the operands of set operations, are
/// coalesced recursively first. After that, runs of adjacent literal atoms,
/// quoted literals and literal-bounded ranges are merged so that grapheme
/// breaking can be performed over the combined text.
///
/// - Parameter members: The members of a custom character class, in source
///   order.
/// - Returns: The members, with each coalescable run replaced by the quoted
///   literals and ranges produced from it.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    ///
    /// Invariant: the array is never empty, and once it holds more than one
    /// operand every operand is non-empty. A range appends its lower bound to
    /// the current operand before starting a new operand with its upper bound.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last entry of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound finishes the current operand, the upper bound
        // starts the next one.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      case .trivia:
        // Trivia must not split a run of scalars that would otherwise form
        // a single grapheme, so absorb it once something has already been
        // coalesced. Leading trivia is left in place so that an accumulator
        // holding nothing is never finished.
        // NOTE(review): this drops the absorbed trivia from the coalesced
        // class; confirm nothing downstream of emission relies on it.
        return !current.isEmpty
      case .custom, .intersection, .subtraction, .symmetricDifference:
        // Nested classes and set operations are never merged into a run.
        return false
      }
    }

    /// Produce the members represented by the accumulated operands.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      guard rangeOperands.count > 1 else {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members = [DSLTree.CustomCharacterClass.Member]()

      // We have other range operands, splice them together. For N operands
      // we have N - 1 ranges.
      for (i, lhs) in rangeOperands.dropLast().enumerated() {
        let rhs = rangeOperands[i + 1]

        // If this is the first operand we only need to drop the last
        // character for its quoted members, otherwise this is both an LHS
        // and RHS of a range, and as such needs both sides trimmed.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }
        // Safe to force unwrap: with more than one operand, every operand
        // is non-empty (see the invariant on `rangeOperands`).
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }
      // We've handled everything except the quoted portion of the last
      // operand, add it now. `current` is that last operand.
      let trailing = current.dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }
  return members
    .map { m -> DSLTree.CustomCharacterClass.Member in
      // First we need to recursively coalesce any child character classes.
      switch m {
      case .custom(let ccc):
        return .custom(coalescingCustomCharacterClass(ccc))
      case .intersection(let lhs, let rhs):
        return .intersection(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .subtraction(let lhs, let rhs):
        return .subtraction(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .symmetricDifference(let lhs, let rhs):
        return .symmetricDifference(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .atom, .range, .quotedLiteral, .trivia:
        return m
      }
    }
    // Then merge each run of members the accumulator accepts into the
    // members produced by `finish()`.
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}
853+
854+
/// Returns `ccc` with its members coalesced when matching with grapheme
/// cluster semantics; otherwise returns `ccc` unchanged.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is only wanted under grapheme semantics. Under scalar
  // semantics scalars must not be merged into graphemes, so that e.g
  // `[e\u{301}-\u{302}]` stays a range between U+301 and U+302.
  if options.semanticLevel == .graphemeCluster {
    return DSLTree.CustomCharacterClass(
      members: coalescingCustomCharacterClassMembers(ccc.members),
      isInverted: ccc.isInverted)
  }
  return ccc
}
866+
754867
mutating func emitCustomCharacterClass(
755868
_ ccc: DSLTree.CustomCharacterClass
756869
) throws {
870+
// Before emitting a custom character class in grapheme semantic mode, we
871+
// need to coalesce together any adjacent characters and scalars, over which
872+
// we can perform grapheme breaking. This includes e.g range bounds for
873+
// `[e\u{301}-\u{302}]`.
874+
let ccc = coalescingCustomCharacterClass(ccc)
757875
if let asciiBitset = ccc.asAsciiBitset(options),
758876
optimizationsEnabled {
759877
if options.semanticLevel == .unicodeScalar {

Sources/_StringProcessing/Utility/Misc.swift

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
extension Array {
1313
/// Coalesce adjacent elements using a given accumulator. The accumulator is
14-
/// transformed into an element of the array by `finish`. The `accumulate`
14+
/// transformed into elements of the array by `finish`. The `accumulate`
1515
/// function should return `true` if the accumulator has coalesced the
1616
/// element, `false` otherwise.
1717
func coalescing<T>(
18-
with initialAccumulator: T, into finish: (T) -> Element,
18+
with initialAccumulator: T, into finish: (T) -> Self,
1919
accumulate: (inout T, Element) -> Bool
2020
) -> Self {
2121
var didAccumulate = false
@@ -32,16 +32,28 @@ extension Array {
3232
if didAccumulate {
3333
// We have a leftover accumulator, which needs to be finished before we
3434
// can append the next element.
35-
result.append(finish(accumulator))
35+
result += finish(accumulator)
3636
accumulator = initialAccumulator
3737
didAccumulate = false
3838
}
3939
result.append(elt)
4040
}
4141
// Handle a leftover accumulation.
4242
if didAccumulate {
43-
result.append(finish(accumulator))
43+
result += finish(accumulator)
4444
}
4545
return result
4646
}
47+
48+
/// Coalesce adjacent elements using a given accumulator, where each finished
/// accumulator becomes exactly one element of the result.
///
/// This is a convenience over the variant whose `finish` returns an array:
/// the single element produced by `finish` is wrapped in a one-element array
/// and forwarded. The `accumulate` function should return `true` if the
/// accumulator has coalesced the element, `false` otherwise.
func coalescing<T>(
  with initialAccumulator: T, into finish: (T) -> Element,
  accumulate: (inout T, Element) -> Bool
) -> Self {
  coalescing(
    with: initialAccumulator,
    into: { finished -> Self in [finish(finished)] },
    accumulate: accumulate)
}
4759
}

0 commit comments

Comments
 (0)