@@ -751,9 +751,127 @@ fileprivate extension Compiler.ByteCodeGen {
751
751
builder. label ( exit)
752
752
}
753
753
754
/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested character classes (`.custom`, and both sides of `.intersection`,
/// `.subtraction` and `.symmetricDifference`) are coalesced recursively
/// first. The resulting members are then folded through a local
/// `Accumulator` using the `coalescing(with:into:)` helper, which is defined
/// elsewhere.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The members with runs of literal atoms, quoted literals and
///   literal ranges merged into quoted literals and ranges.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    ///
    /// The seed operand must be the *empty* string: anything else (such as a
    /// single space) would be emitted as part of the first quoted literal or
    /// range bound.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last element of `rangeOperands`.
    /// `rangeOperands` always holds at least one element, so the index is
    /// always valid.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise. The accumulator is left untouched when
    /// `false` is returned.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound finishes the current operand; the upper bound
        // starts a new one.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      default:
        return false
      }
    }

    /// Produce the coalesced members from everything accumulated so far.
    ///
    /// With no accumulated range this is a single quoted literal. Otherwise
    /// it is a sequence of ranges, with a quoted literal emitted for any
    /// characters of an operand that are not themselves a range bound.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      if rangeOperands.count == 1 {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members = [DSLTree.CustomCharacterClass.Member]()

      // We have other range operands, splice them together. For N operands
      // we have N - 1 ranges.
      for (i, lhs) in rangeOperands.dropLast().enumerated() {
        let rhs = rangeOperands[i + 1]

        // If this is the first operand we only need to drop the last
        // character for its quoted members, otherwise this is both an LHS
        // and RHS of a range, and as such needs both sides trimmed.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }
        // The force-unwraps cannot fail: every operand but the last had a
        // range's lower bound appended to it, and every operand but the
        // first was seeded with a range's upper bound.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }
      // We've handled everything except the quoted portion of the last
      // operand, add it now.
      let trailing = rangeOperands.last!.dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }
  return members
    .map { m -> DSLTree.CustomCharacterClass.Member in
      // First we need to recursively coalesce any child character classes.
      switch m {
      case .custom(let ccc):
        return .custom(coalescingCustomCharacterClass(ccc))
      case .intersection(let lhs, let rhs):
        return .intersection(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .subtraction(let lhs, let rhs):
        return .subtraction(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .symmetricDifference(let lhs, let rhs):
        return .symmetricDifference(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .atom, .range, .quotedLiteral, .trivia:
        return m
      }
    }
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}
853
+
854
/// Returns `ccc` with its members coalesced when matching with grapheme
/// cluster semantics, or `ccc` unchanged for any other semantic level.
///
/// - Parameter ccc: The custom character class to process.
/// - Returns: A character class with the same `isInverted` flag whose members
///   have been passed through `coalescingCustomCharacterClassMembers`, or the
///   input itself when no coalescing applies.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is a grapheme-mode concern only. Under scalar semantics the
  // individual scalars must stay separate, so that e.g `[e\u{301}-\u{302}]`
  // is still a range from U+301 to U+302.
  if options.semanticLevel != .graphemeCluster {
    return ccc
  }

  return DSLTree.CustomCharacterClass(
    members: coalescingCustomCharacterClassMembers(ccc.members),
    isInverted: ccc.isInverted)
}
866
+
754
867
mutating func emitCustomCharacterClass(
755
868
_ ccc: DSLTree . CustomCharacterClass
756
869
) throws {
870
+ // Before emitting a custom character class in grapheme semantic mode, we
871
+ // need to coalesce together any adjacent characters and scalars, over which
872
+ // we can perform grapheme breaking. This includes e.g range bounds for
873
+ // `[e\u{301}-\u{302}]`.
874
+ let ccc = coalescingCustomCharacterClass ( ccc)
757
875
if let asciiBitset = ccc. asAsciiBitset ( options) ,
758
876
optimizationsEnabled {
759
877
if options. semanticLevel == . unicodeScalar {
0 commit comments