Skip to content

Commit a3e517e

Browse files
committed
[stdlib] String: Fix forward implementation of grapheme breaking rule 11
Rule GB11 in Unicode Standard Annex #29 is:

    GB11: Extended_Pictographic Extend* ZWJ × Extended_Pictographic

However, our forward grapheme breaking state machine implements it as:

    GB11: Extended_Pictographic Extend* ZWJ+ × Extended_Pictographic

We implement the correct rule when going backward, so a String value can report a different count depending on whether we’re iterating forward or backward. The rule as implemented would be fine on its own (Unicode doesn’t care much about the placement of grapheme breaks in invalid sequences), but the directional inconsistency breaks String’s Collection conformance.

rdar://104279671
1 parent d70e16b commit a3e517e

File tree

2 files changed

+47
-20
lines changed

2 files changed

+47
-20
lines changed

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -723,11 +723,17 @@ extension _GraphemeBreakingState {
723723
case (_, .extend),
724724
(_, .zwj):
725725

726-
// If we're currently in an emoji sequence, then extends and ZWJ help
727-
// continue the grapheme cluster by combining more scalars later. If we're
728-
// not currently in an emoji sequence, but our lhs scalar is a pictograph,
729-
// then that's a signal that it's the start of an emoji sequence.
730-
if self.isInEmojiSequence || x == .extendedPictographic {
726+
// Prepare for recognizing GB11, by remembering if we're in an emoji
727+
// sequence.
728+
//
729+
// GB11: Extended_Pictographic Extend* ZWJ × Extended_Pictographic
730+
//
731+
// If our left-side scalar is a pictograph, then it starts a new emoji
732+
// sequence; the sequence continues through subsequent extend/extend and
733+
// extend/zwj pairs.
734+
if (
735+
x == .extendedPictographic || (self.isInEmojiSequence && x == .extend)
736+
) {
731737
enterEmojiSequence = true
732738
}
733739

validation-test/stdlib/StringGraphemeBreaking.swift

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,25 @@ extension String {
3737
}
3838
}
3939

40+
func check(
41+
_ string: String,
42+
_ pieces: [[Unicode.Scalar]],
43+
file: String = #file, line: UInt = #line
44+
) {
45+
expectEqual(
46+
string.forwardPieces, pieces,
47+
"string: \(String(reflecting: string)) (forward)",
48+
file: file, line: line)
49+
expectEqual(
50+
string.backwardPieces, pieces,
51+
"string: \(String(reflecting: string)) (backward)",
52+
file: file, line: line)
53+
}
54+
4055
if #available(SwiftStdlib 5.6, *) {
4156
StringGraphemeBreaking.test("grapheme breaking") {
4257
for test in graphemeBreakTests {
43-
expectEqual(
44-
test.string.forwardPieces, test.pieces,
45-
"string: \(String(reflecting: test.string)) (forward)")
46-
expectEqual(
47-
test.string.backwardPieces, test.pieces,
48-
"string: \(String(reflecting: test.string)) (backward)")
58+
check(test.string, test.pieces)
4959
}
5060
}
5161
}
@@ -65,8 +75,8 @@ class NonContiguousNSString: NSString {
6575
super.init()
6676
}
6777

68-
init(_ value: [UInt16]) {
69-
_value = value
78+
init(_ value: some Sequence<UInt16>) {
79+
_value = Array(value)
7080
super.init()
7181
}
7282

@@ -95,16 +105,27 @@ extension _StringGuts {
95105
if #available(SwiftStdlib 5.6, *) {
96106
StringGraphemeBreaking.test("grapheme breaking foreign") {
97107
for test in graphemeBreakTests {
98-
let foreign = NonContiguousNSString(Array(test.string.utf16))
108+
let foreign = NonContiguousNSString(test.string.utf16)
99109
let string = foreign as String
100110

101111
expectTrue(string._guts._isForeign())
102-
expectEqual(
103-
string.forwardPieces, test.pieces,
104-
"string: \(String(reflecting: test.string)) (forward)")
105-
expectEqual(
106-
string.backwardPieces, test.pieces,
107-
"string: \(String(reflecting: test.string)) (backward)")
112+
check(string, test.pieces)
108113
}
109114
}
110115
}
116+
117+
if #available(SwiftStdlib 5.8, *) {
118+
StringGraphemeBreaking.test("GB11") {
119+
// MAN, ZERO WIDTH JOINER, ZERO WIDTH JOINER, GIRL
120+
let string = "\u{1f468}\u{200d}\u{200d}\u{1f467}"
121+
let pieces: [[Unicode.Scalar]] = [
122+
["\u{1f468}", "\u{200d}", "\u{200d}"],
123+
["\u{1f467}"]
124+
]
125+
check(string, pieces)
126+
127+
let foreign = NonContiguousNSString(string.utf16) as String
128+
expectTrue(foreign._guts._isForeign())
129+
check(foreign, pieces)
130+
}
131+
}

0 commit comments

Comments
 (0)