@@ -420,11 +420,16 @@ extension _StringGuts {
420
420
}
421
421
422
422
extension Unicode . Scalar {
423
- fileprivate var _isLinkingConsonant : Bool {
424
- _swift_stdlib_isLinkingConsonant ( value)
423
+ fileprivate var _isInCBConsonant : Bool {
424
+ _swift_stdlib_isInCB_Consonant ( value)
425
425
}
426
426
427
- fileprivate var _isVirama : Bool {
427
+ fileprivate var _isInCBExtend : Bool {
428
+ // Assuming that we're already an Extend or ZWJ...
429
+ !( _isInCBConsonant || _isInCBLinker || value == 0x200C )
430
+ }
431
+
432
+ fileprivate var _isInCBLinker : Bool {
428
433
switch value {
429
434
// Devanagari
430
435
case 0x94D :
@@ -453,10 +458,10 @@ extension Unicode.Scalar {
453
458
454
459
internal struct _GraphemeBreakingState : Sendable , Equatable {
455
460
// When we're looking through an indic sequence, one of the requirements is
456
- // that there is at LEAST 1 Virama present between two linking consonants .
461
+ // that there is at LEAST 1 InCB=Linker present between two InCB=Consonant scalars.
457
462
// This value helps ensure that when we ultimately need to decide whether or
458
463
// not to break that we've at least seen 1 when walking.
459
- var hasSeenVirama = false
464
+ var hasSeenInCBLinker = false
460
465
461
466
// When walking forwards in a string, we need to know whether or not we've
462
467
// entered an emoji sequence to be able to eventually break after all of the
@@ -483,7 +488,7 @@ internal struct _GraphemeBreakingState: Sendable, Equatable {
483
488
extension _GraphemeBreakingState : CustomStringConvertible {
484
489
var description : String {
485
490
var r = " [ "
486
- if hasSeenVirama { r += " V " }
491
+ if hasSeenInCBLinker { r += " L " }
487
492
if isInEmojiSequence { r += " E " }
488
493
if isInIndicSequence { r += " I " }
489
494
if shouldBreakRI { r += " R " }
@@ -729,8 +734,8 @@ extension _GraphemeBreakingState {
729
734
var enterIndicSequence = false
730
735
731
736
defer {
732
- self . isInEmojiSequence = enterEmojiSequence
733
- self . isInIndicSequence = enterIndicSequence
737
+ isInEmojiSequence = enterEmojiSequence
738
+ isInIndicSequence = enterIndicSequence
734
739
}
735
740
736
741
let y = Unicode . _GraphemeBreakProperty ( from: scalar2)
@@ -767,7 +772,7 @@ extension _GraphemeBreakingState {
767
772
( . t, . t) :
768
773
return false
769
774
770
- // GB9 (partial GB11)
775
+ // GB9 (partial GB9c and partial GB11)
771
776
case ( _, . extend) ,
772
777
( _, . zwj) :
773
778
@@ -780,29 +785,52 @@ extension _GraphemeBreakingState {
780
785
// sequence; the sequence continues through subsequent extend/extend and
781
786
// extend/zwj pairs.
782
787
if (
783
- x == . extendedPictographic || ( self . isInEmojiSequence && x == . extend)
788
+ x == . extendedPictographic || ( isInEmojiSequence && x == . extend)
784
789
) {
785
790
enterEmojiSequence = true
786
791
}
787
792
788
- // If we're currently in an indic sequence (or if our lhs is a linking
789
- // consonant), then this check and everything underneath ensures that
790
- // we continue being in one and may check if this extend is a Virama.
791
- if self . isInIndicSequence || scalar1. _isLinkingConsonant {
792
- if y == . extend {
793
- let extendNormData = Unicode . _NormData ( scalar2, fastUpperbound: 0x300 )
794
-
795
- // If our extend's CCC is 0, then this rule does not apply.
796
- guard extendNormData. ccc != 0 else {
797
- return false
798
- }
793
+ // GB9c: InCB=Consonant [InCB=Extend InCB=Linker]* InCB=Linker [InCB=Extend InCB=Linker]* × InCB=Consonant
794
+ //
795
+ // If our lhs is an InCB=Consonant and our rhs is either an InCB=Extend or
796
+ // an InCB=Linker, then enter an indic sequence and, if scalar2 is a
797
+ // linker, record that we've seen a linker.
798
+ //
799
+ // If the lhs is not an InCB=Consonant, then check if we're currently in
800
+ // an indic sequence to properly propagate that back to the state.
801
+ // Otherwise, we're not in an indic sequence, but our rhs is still an
802
+ // extension scalar so don't break regardless right here. If we are in an
803
+ // indic sequence, tell the state that we've seen a linker if our rhs is
804
+ // one.
805
+ switch ( scalar1. _isInCBConsonant, scalar2. _isInCBExtend, scalar2. _isInCBLinker) {
806
+ // (InCB=Consonant, InCB=Extend)
807
+ case ( true , true , false ) :
808
+ enterIndicSequence = true
809
+
810
+ // (InCB=Consonant, InCB=Linker)
811
+ case ( true , false , true ) :
812
+ enterIndicSequence = true
813
+ hasSeenInCBLinker = true
814
+
815
+ // (_, InCB=Extend)
816
+ case ( false , true , false ) :
817
+ guard isInIndicSequence else {
818
+ break
799
819
}
800
820
801
821
enterIndicSequence = true
802
822
803
- if scalar2. _isVirama {
804
- self . hasSeenVirama = true
823
+ // (_, InCB=Linker)
824
+ case ( false , false , true ) :
825
+ guard isInIndicSequence else {
826
+ break
805
827
}
828
+
829
+ enterIndicSequence = true
830
+ hasSeenInCBLinker = true
831
+
832
+ default :
833
+ break
806
834
}
807
835
808
836
return false
@@ -817,25 +845,21 @@ extension _GraphemeBreakingState {
817
845
818
846
// GB11
819
847
case ( . zwj, . extendedPictographic) :
820
- return !self . isInEmojiSequence
848
+ return !isInEmojiSequence
821
849
822
850
// GB12 & GB13
823
851
case ( . regionalIndicator, . regionalIndicator) :
824
852
defer {
825
- self . shouldBreakRI. toggle ( )
853
+ shouldBreakRI. toggle ( )
826
854
}
827
855
828
- return self . shouldBreakRI
856
+ return shouldBreakRI
829
857
830
858
// GB999
831
859
default :
832
860
// GB9c
833
- if
834
- self . isInIndicSequence,
835
- self . hasSeenVirama,
836
- scalar2. _isLinkingConsonant
837
- {
838
- self . hasSeenVirama = false
861
+ if isInIndicSequence, hasSeenInCBLinker, scalar2. _isInCBConsonant {
862
+ hasSeenInCBLinker = false
839
863
return false
840
864
}
841
865
@@ -905,7 +929,7 @@ extension _StringGuts {
905
929
( . t, . t) :
906
930
return false
907
931
908
- // GB9 (partial GB11)
932
+ // GB9
909
933
case ( _, . extend) ,
910
934
( _, . zwj) :
911
935
return false
@@ -929,22 +953,19 @@ extension _StringGuts {
929
953
// GB999
930
954
default :
931
955
// GB9c
932
- switch ( x, scalar2. _isLinkingConsonant) {
933
- case ( . extend, true ) :
934
- let extendNormData = Unicode . _NormData ( scalar1, fastUpperbound: 0x300 )
935
-
936
- guard extendNormData. ccc != 0 else {
937
- return true
938
- }
939
-
940
- return !checkIfInIndicSequence( at: index, with: previousScalar)
941
-
942
- case ( . zwj, true ) :
956
+ //
957
+ // Check if our rhs is an InCB=Consonant first because we can more easily
958
+ // exit out of this branch in most cases. Otherwise, the rhs is a consonant.
959
+ // Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
960
+ // if it's an .extend or .zwj first because _isInCBExtend assumes that its
961
+ // receiver already is one).
962
+ if scalar2. _isInCBConsonant,
963
+ ( x == . extend || x == . zwj) ,
964
+ ( scalar1. _isInCBExtend || scalar1. _isInCBLinker) {
943
965
return !checkIfInIndicSequence( at: index, with: previousScalar)
944
-
945
- default :
946
- return true
947
966
}
967
+
968
+ return true
948
969
}
949
970
}
950
971
@@ -1013,69 +1034,73 @@ extension _StringGuts {
1013
1034
}
1014
1035
1015
1036
// When walking backwards, it's impossible to know whether we break when we
1016
- // see our first ((.extend|.zwj), .linkingConsonant) without walking
1017
- // further backwards. This walks the string backwards enough until we figure
1018
- // out whether or not to break this indic sequence. For example:
1037
+ // see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
1038
+ // without walking further backwards. This walks the string backwards enough
1039
+ // until we figure out whether or not to break this indic sequence. For example:
1019
1040
//
1020
1041
// Scalar view #1:
1021
1042
//
1022
- // [.virama, .extend, .linkingConsonant]
1023
- // ^
1024
- // | = To be able to know whether or not to break these
1025
- // two, we need to walk backwards to determine if
1026
- // this is a legitimate indic sequence.
1043
+ // [InCB=Linker, InCB=Extend, InCB=Consonant]
1044
+ // ^
1045
+ // | = To be able to know whether or not to
1046
+ // break these two, we need to walk
1047
+ // backwards to determine if this is a
1048
+ // legitimate indic sequence.
1027
1049
// ^
1028
- // | = The scalar sequence ends without a starting linking consonant ,
1050
+ // | = The scalar sequence ends without a starting InCB=Consonant,
1029
1051
// so this is in fact not an indic sequence, so we can break the two.
1030
1052
//
1031
1053
// Scalar view #2:
1032
1054
//
1033
- // [.linkingConsonant, .virama, .extend, .linkingConsonant ]
1034
- // ^
1035
- // | = Same as above
1055
+ // [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
1056
+ // ^
1057
+ // | = Same as above
1036
1058
// ^
1037
- // | = This is a virama , so we at least have seen
1059
+ // | = This is a Linker, so we at least have seen
1038
1060
// 1 to be able to return true if we see a
1039
- // linking consonant later.
1061
+ // consonant later.
1040
1062
// ^
1041
- // | = Is a linking consonant and we've seen a virama , so this is a
1063
+ // | = Is a consonant and we've seen a linker, so this is a
1042
1064
// legitimate indic sequence, so do NOT break the initial question.
1043
1065
internal func checkIfInIndicSequence(
1044
1066
at index: Int ,
1045
1067
with previousScalar: ( Int ) -> ( scalar: Unicode . Scalar , start: Int ) ?
1046
1068
) -> Bool {
1047
1069
guard let p = previousScalar ( index) else { return false }
1048
1070
1049
- var hasSeenVirama = p. scalar. _isVirama
1071
+ var hasSeenInCBLinker = p. scalar. _isInCBLinker
1050
1072
var i = p. start
1051
1073
1052
1074
while let ( scalar, prev) = previousScalar ( i) {
1053
1075
i = prev
1076
+
1077
+ if scalar. _isInCBConsonant {
1078
+ return hasSeenInCBLinker
1079
+ }
1080
+
1054
1081
let gbp = Unicode . _GraphemeBreakProperty ( from: scalar)
1055
1082
1056
- switch ( gbp, scalar . _isLinkingConsonant ) {
1057
- case ( . extend , false ) :
1058
- let extendNormData = Unicode . _NormData ( scalar , fastUpperbound : 0x300 )
1083
+ guard gbp == . extend || gbp == . zwj else {
1084
+ return false
1085
+ }
1059
1086
1060
- guard extendNormData . ccc != 0 else {
1061
- return false
1062
- }
1087
+ switch ( scalar . _isInCBExtend , scalar . _isInCBLinker ) {
1088
+ case ( false , false ) :
1089
+ return false
1063
1090
1064
- if scalar. _isVirama {
1065
- hasSeenVirama = true
1066
- }
1091
+ case ( false , true ) :
1092
+ hasSeenInCBLinker = true
1067
1093
1068
- case ( . zwj , false ) :
1094
+ case ( true , false ) :
1069
1095
continue
1070
1096
1071
- // LinkingConsonant
1072
- case ( _, true ) :
1073
- return hasSeenVirama
1074
-
1075
- default :
1097
+ case ( true , true ) :
1098
+ // This case should never happen because _isInCBExtend excludes linkers,
1099
+ // but if it does then just be cautious and say this is invalid.
1076
1100
return false
1077
1101
}
1078
1102
}
1103
+
1079
1104
return false
1080
1105
}
1081
1106
0 commit comments