Skip to content

Commit 1f74aa1

Browse files
committed
Update grapheme breaking logic to support Unicode 16
1 parent 4e08765 commit 1f74aa1

File tree

1 file changed

+103
-78
lines changed

1 file changed

+103
-78
lines changed

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 103 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -420,11 +420,16 @@ extension _StringGuts {
420420
}
421421

422422
extension Unicode.Scalar {
423-
fileprivate var _isLinkingConsonant: Bool {
424-
_swift_stdlib_isLinkingConsonant(value)
423+
fileprivate var _isInCBConsonant: Bool {
424+
_swift_stdlib_isInCB_Consonant(value)
425425
}
426426

427-
fileprivate var _isVirama: Bool {
427+
fileprivate var _isInCBExtend: Bool {
428+
// Precondition: the caller has already established that this scalar's
// grapheme break property is Extend or ZWJ; that is not re-checked here.
429+
!(_isInCBConsonant || _isInCBLinker || value == 0x200C)
430+
}
431+
432+
fileprivate var _isInCBLinker: Bool {
428433
switch value {
429434
// Devanagari
430435
case 0x94D:
@@ -453,10 +458,10 @@ extension Unicode.Scalar {
453458

454459
internal struct _GraphemeBreakingState: Sendable, Equatable {
455460
// When we're looking through an indic sequence, one of the requirements is
456-
// that there is at LEAST 1 Virama present between two linking consonants.
461+
// that there is at LEAST 1 InCB=Linker between two InCB=Consonant scalars.
457462
// This value helps ensure that when we ultimately need to decide whether or
458463
// not to break that we've at least seen 1 when walking.
459-
var hasSeenVirama = false
464+
var hasSeenInCBLinker = false
460465

461466
// When walking forwards in a string, we need to know whether or not we've
462467
// entered an emoji sequence to be able to eventually break after all of the
@@ -483,7 +488,7 @@ internal struct _GraphemeBreakingState: Sendable, Equatable {
483488
extension _GraphemeBreakingState: CustomStringConvertible {
484489
var description: String {
485490
var r = "["
486-
if hasSeenVirama { r += "V" }
491+
if hasSeenInCBLinker { r += "L" }
487492
if isInEmojiSequence { r += "E" }
488493
if isInIndicSequence { r += "I" }
489494
if shouldBreakRI { r += "R" }
@@ -729,8 +734,8 @@ extension _GraphemeBreakingState {
729734
var enterIndicSequence = false
730735

731736
defer {
732-
self.isInEmojiSequence = enterEmojiSequence
733-
self.isInIndicSequence = enterIndicSequence
737+
isInEmojiSequence = enterEmojiSequence
738+
isInIndicSequence = enterIndicSequence
734739
}
735740

736741
let y = Unicode._GraphemeBreakProperty(from: scalar2)
@@ -767,7 +772,7 @@ extension _GraphemeBreakingState {
767772
(.t, .t):
768773
return false
769774

770-
// GB9 (partial GB11)
775+
// GB9 (partial GB9c and partial GB11)
771776
case (_, .extend),
772777
(_, .zwj):
773778

@@ -780,29 +785,52 @@ extension _GraphemeBreakingState {
780785
// sequence; the sequence continues through subsequent extend/extend and
781786
// extend/zwj pairs.
782787
if (
783-
x == .extendedPictographic || (self.isInEmojiSequence && x == .extend)
788+
x == .extendedPictographic || (isInEmojiSequence && x == .extend)
784789
) {
785790
enterEmojiSequence = true
786791
}
787792

788-
// If we're currently in an indic sequence (or if our lhs is a linking
789-
// consonant), then this check and everything underneath ensures that
790-
// we continue being in one and may check if this extend is a Virama.
791-
if self.isInIndicSequence || scalar1._isLinkingConsonant {
792-
if y == .extend {
793-
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
794-
795-
// If our extend's CCC is 0, then this rule does not apply.
796-
guard extendNormData.ccc != 0 else {
797-
return false
798-
}
793+
// GB9c: InCB=Consonant [InCB=Extend InCB=Linker]* InCB=Linker [InCB=Extend InCB=Linker]* × InCB=Consonant
794+
//
795+
// If our lhs is an InCB=Consonant and our rhs is either an InCB=Extend or
796+
// an InCB=Linker, then enter into an indic sequence and mark if scalar 2
797+
// is a linker and that we've seen a linker.
798+
//
799+
// If the lhs is not an InCB=Consonant, then check if we're currently in
800+
// an indic sequence to properly propagate that back to the state.
801+
// Otherwise, we're not in an indic sequence, but our rhs is still an
802+
// extension scalar so don't break regardless right here. If we are in an
803+
// indic sequence, tell the state that we've seen a linker if our rhs is
804+
// one.
805+
switch (scalar1._isInCBConsonant, scalar2._isInCBExtend, scalar2._isInCBLinker) {
806+
// (InCB=Consonant, InCB=Extend)
807+
case (true, true, false):
808+
enterIndicSequence = true
809+
810+
// (InCB=Consonant, InCB=Linker)
811+
case (true, false, true):
812+
enterIndicSequence = true
813+
hasSeenInCBLinker = true
814+
815+
// (_, InCB=Extend)
816+
case (false, true, false):
817+
guard isInIndicSequence else {
818+
break
799819
}
800820

801821
enterIndicSequence = true
802822

803-
if scalar2._isVirama {
804-
self.hasSeenVirama = true
823+
// (_, InCB=Linker)
824+
case (false, false, true):
825+
guard isInIndicSequence else {
826+
break
805827
}
828+
829+
enterIndicSequence = true
830+
hasSeenInCBLinker = true
831+
832+
default:
833+
break
806834
}
807835

808836
return false
@@ -817,25 +845,21 @@ extension _GraphemeBreakingState {
817845

818846
// GB11
819847
case (.zwj, .extendedPictographic):
820-
return !self.isInEmojiSequence
848+
return !isInEmojiSequence
821849

822850
// GB12 & GB13
823851
case (.regionalIndicator, .regionalIndicator):
824852
defer {
825-
self.shouldBreakRI.toggle()
853+
shouldBreakRI.toggle()
826854
}
827855

828-
return self.shouldBreakRI
856+
return shouldBreakRI
829857

830858
// GB999
831859
default:
832860
// GB9c
833-
if
834-
self.isInIndicSequence,
835-
self.hasSeenVirama,
836-
scalar2._isLinkingConsonant
837-
{
838-
self.hasSeenVirama = false
861+
if isInIndicSequence, hasSeenInCBLinker, scalar2._isInCBConsonant {
862+
hasSeenInCBLinker = false
839863
return false
840864
}
841865

@@ -905,7 +929,7 @@ extension _StringGuts {
905929
(.t, .t):
906930
return false
907931

908-
// GB9 (partial GB11)
932+
// GB9
909933
case (_, .extend),
910934
(_, .zwj):
911935
return false
@@ -929,22 +953,19 @@ extension _StringGuts {
929953
// GB999
930954
default:
931955
// GB9c
932-
switch (x, scalar2._isLinkingConsonant) {
933-
case (.extend, true):
934-
let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300)
935-
936-
guard extendNormData.ccc != 0 else {
937-
return true
938-
}
939-
940-
return !checkIfInIndicSequence(at: index, with: previousScalar)
941-
942-
case (.zwj, true):
956+
//
957+
// Check if our rhs is an InCB=Consonant first because we can more easily
958+
// exit out of this branch in most cases. Past that check, the rhs is a consonant.
959+
// Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
960+
// if it's an .extend or .zwj first because _isInCBExtend assumes that it
961+
// is true).
962+
if scalar2._isInCBConsonant,
963+
(x == .extend || x == .zwj),
964+
(scalar1._isInCBExtend || scalar1._isInCBLinker) {
943965
return !checkIfInIndicSequence(at: index, with: previousScalar)
944-
945-
default:
946-
return true
947966
}
967+
968+
return true
948969
}
949970
}
950971

@@ -1013,69 +1034,73 @@ extension _StringGuts {
10131034
}
10141035

10151036
// When walking backwards, it's impossible to know whether we break when we
1016-
// see our first ((.extend|.zwj), .linkingConsonant) without walking
1017-
// further backwards. This walks the string backwards enough until we figure
1018-
// out whether or not to break this indic sequence. For example:
1037+
// see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
1038+
// without walking further backwards. This walks the string backwards enough
1039+
// until we figure out whether or not to break this indic sequence. For example:
10191040
//
10201041
// Scalar view #1:
10211042
//
1022-
// [.virama, .extend, .linkingConsonant]
1023-
// ^
1024-
// | = To be able to know whether or not to break these
1025-
// two, we need to walk backwards to determine if
1026-
// this is a legitimate indic sequence.
1043+
// [InCB=Linker, InCB=Extend, InCB=Consonant]
1044+
// ^
1045+
// | = To be able to know whether or not to
1046+
// break these two, we need to walk
1047+
// backwards to determine if this is a
1048+
// legitimate indic sequence.
10271049
// ^
1028-
// | = The scalar sequence ends without a starting linking consonant,
1050+
// | = The scalar sequence ends without a starting InCB=Consonant,
10291051
// which means this is not an indic sequence and we can break the two.
10301052
//
10311053
// Scalar view #2:
10321054
//
1033-
// [.linkingConsonant, .virama, .extend, .linkingConsonant]
1034-
// ^
1035-
// | = Same as above
1055+
// [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
1056+
// ^
1057+
// | = Same as above
10361058
// ^
1037-
// | = This is a virama, so we at least have seen
1059+
// | = This is a Linker, so we at least have seen
10381060
// 1 to be able to return true if we see a
1039-
// linking consonant later.
1061+
// consonant later.
10401062
// ^
1041-
// | = Is a linking consonant and we've seen a virama, so this is a
1063+
// | = Is a consonant and we've seen a linker, so this is a
10421064
// legitimate indic sequence, so do NOT break the initial question.
10431065
internal func checkIfInIndicSequence(
10441066
at index: Int,
10451067
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
10461068
) -> Bool {
10471069
guard let p = previousScalar(index) else { return false }
10481070

1049-
var hasSeenVirama = p.scalar._isVirama
1071+
var hasSeenInCBLinker = p.scalar._isInCBLinker
10501072
var i = p.start
10511073

10521074
while let (scalar, prev) = previousScalar(i) {
10531075
i = prev
1076+
1077+
if scalar._isInCBConsonant {
1078+
return hasSeenInCBLinker
1079+
}
1080+
10541081
let gbp = Unicode._GraphemeBreakProperty(from: scalar)
10551082

1056-
switch (gbp, scalar._isLinkingConsonant) {
1057-
case (.extend, false):
1058-
let extendNormData = Unicode._NormData(scalar, fastUpperbound: 0x300)
1083+
guard gbp == .extend || gbp == .zwj else {
1084+
return false
1085+
}
10591086

1060-
guard extendNormData.ccc != 0 else {
1061-
return false
1062-
}
1087+
switch (scalar._isInCBExtend, scalar._isInCBLinker) {
1088+
case (false, false):
1089+
return false
10631090

1064-
if scalar._isVirama {
1065-
hasSeenVirama = true
1066-
}
1091+
case (false, true):
1092+
hasSeenInCBLinker = true
10671093

1068-
case (.zwj, false):
1094+
case (true, false):
10691095
continue
10701096

1071-
// LinkingConsonant
1072-
case (_, true):
1073-
return hasSeenVirama
1074-
1075-
default:
1097+
case (true, true):
1098+
// This case should never happen, but if it does then just be cautious
1099+
// and say this is invalid.
10761100
return false
10771101
}
10781102
}
1103+
10791104
return false
10801105
}
10811106

0 commit comments

Comments
 (0)