Skip to content

Commit 3b48522

Browse files
committed
Merge pull request #40746 from Azoy/indic-grapheme-clusters
[stdlib] Implement the Indic grapheme breaking rules
1 parent bdafd9b commit 3b48522

File tree

7 files changed

+495
-98
lines changed

7 files changed

+495
-98
lines changed

stdlib/public/SwiftShims/UnicodeData.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ __swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x,
6262
SWIFT_RUNTIME_STDLIB_INTERNAL
6363
__swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar);
6464

65+
SWIFT_RUNTIME_STDLIB_INTERNAL
66+
__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar);
67+
6568
//===----------------------------------------------------------------------===//
6669
// Unicode.Scalar.Properties
6770
//===----------------------------------------------------------------------===//

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 183 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
import SwiftShims
14+
1315
/// CR and LF are common special cases in grapheme breaking logic
1416
private var _CR: UInt8 { return 0x0d }
1517
private var _LF: UInt8 { return 0x0a }
@@ -175,13 +177,56 @@ extension _StringGuts {
175177
}
176178
}
177179

180+
extension Unicode.Scalar {
  /// Whether this scalar is reported as a linking consonant by the runtime's
  /// `_swift_stdlib_isLinkingConsonant` lookup, which is handed the scalar's
  /// raw `value`.
  fileprivate var _isLinkingConsonant: Bool {
    return _swift_stdlib_isLinkingConsonant(value)
  }

  /// Whether this scalar is one of the Virama code points recognized by the
  /// Indic grapheme breaking logic.
  ///
  /// The recognized scalars, by script:
  ///
  ///   U+094D  Devanagari
  ///   U+09CD  Bengali
  ///   U+0ACD  Gujarati
  ///   U+0B4D  Oriya
  ///   U+0C4D  Telugu
  ///   U+0D4D  Malayalam
  fileprivate var _isVirama: Bool {
    switch value {
    case 0x94D, 0x9CD, 0xACD, 0xB4D, 0xC4D, 0xD4D:
      return true
    default:
      return false
    }
  }
}
211+
178212
internal struct _GraphemeBreakingState {
213+
// When we're looking through an indic sequence, one of the requirements is
214+
// that there is at LEAST 1 Virama present between two linking consonants.
215+
// This value helps ensure that when we ultimately need to decide whether or
216+
// not to break that we've at least seen 1 when walking.
217+
var hasSeenVirama = false
218+
179219
// When walking forwards in a string, we need to know whether or not we've
180220
// entered an emoji sequence to be able to eventually break after all of the
181221
// emoji's various extenders and zero width joiners. This bit allows us to
182222
// keep track of whether or not we're still in an emoji sequence when deciding
183223
// to break.
184-
var isInEmojiSequence: Bool = false
224+
var isInEmojiSequence = false
225+
226+
// Similar to emoji sequences, we need to know not to break an Indic grapheme
227+
// sequence. This sequence is (potentially) composed of many scalars and isn't
228+
// as trivial as comparing two grapheme properties.
229+
var isInIndicSequence = false
185230

186231
// When walking forward in a string, we need to not break on emoji flag
187232
// sequences. Emoji flag sequences are composed of 2 regional indicators, so
@@ -190,7 +235,7 @@ internal struct _GraphemeBreakingState {
190235
// is another regional indicator, we reach the same decision rule, but in this
191236
// case we actually need to break because there's a boundary between emoji flag
192237
// sequences.
193-
var shouldBreakRI: Bool = false
238+
var shouldBreakRI = false
194239
}
195240

196241
extension _StringGuts {
@@ -288,8 +333,12 @@ extension _StringGuts {
288333
// continue treating the current grapheme cluster as an emoji sequence.
289334
var enterEmojiSequence = false
290335

336+
// Very similar to emoji sequences, but for Indic grapheme sequences.
337+
var enterIndicSequence = false
338+
291339
defer {
292340
state.isInEmojiSequence = enterEmojiSequence
341+
state.isInIndicSequence = enterIndicSequence
293342
}
294343

295344
switch (x, y) {
@@ -338,6 +387,26 @@ extension _StringGuts {
338387
enterEmojiSequence = true
339388
}
340389

390+
// If we're currently in an indic sequence (or if our lhs is a linking
391+
// consonant), then this check and everything underneath ensures that
392+
// we continue being in one and may check if this extend is a Virama.
393+
if state.isInIndicSequence || scalar1._isLinkingConsonant {
394+
if y == .extend {
395+
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
396+
397+
// If our extend's CCC is 0, then this rule does not apply.
398+
guard extendNormData.ccc != 0 else {
399+
return false
400+
}
401+
}
402+
403+
enterIndicSequence = true
404+
405+
if scalar2._isVirama {
406+
state.hasSeenVirama = true
407+
}
408+
}
409+
341410
return false
342411

343412
// GB9a
@@ -370,6 +439,32 @@ extension _StringGuts {
370439

371440
// GB999
372441
default:
442+
// GB9c
443+
if state.isInIndicSequence, state.hasSeenVirama, scalar2._isLinkingConsonant {
444+
state.hasSeenVirama = false
445+
return false
446+
}
447+
448+
// Handle GB9c when walking backwards.
449+
if isBackwards {
450+
switch (x, scalar2._isLinkingConsonant) {
451+
case (.extend, true):
452+
let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300)
453+
454+
guard extendNormData.ccc != 0 else {
455+
return true
456+
}
457+
458+
return !checkIfInIndicSequence(index)
459+
460+
case (.zwj, true):
461+
return !checkIfInIndicSequence(index)
462+
463+
default:
464+
return true
465+
}
466+
}
467+
373468
return true
374469
}
375470
}
@@ -417,9 +512,7 @@ extension _StringGuts {
417512
// | = We found our starting .extendedPictographic letting us
418513
// know that we are in an emoji sequence so our initial
419514
// break question is answered as NO.
420-
internal func checkIfInEmojiSequence(
421-
_ index: Int
422-
) -> Bool {
515+
internal func checkIfInEmojiSequence(_ index: Int) -> Bool {
423516
var emojiIdx = String.Index(_encodedOffset: index)
424517

425518
guard emojiIdx != startIndex else {
@@ -447,7 +540,91 @@ extension _StringGuts {
447540

448541
return false
449542
}
450-
543+
544+
// When walking backwards, it's impossible to know whether we break when we
545+
// see our first ((.extend|.zwj), .linkingConsonant) without walking
546+
// further backwards. This walks the string backwards enough until we figure
547+
// out whether or not to break this indic sequence. For example:
548+
//
549+
// Scalar view #1:
550+
//
551+
// [.virama, .extend, .linkingConsonant]
552+
// ^
553+
// | = To be able to know whether or not to break these
554+
// two, we need to walk backwards to determine if
555+
// this is a legitimate indic sequence.
556+
// ^
557+
// | = The scalar sequence ends without a starting linking consonant,
558+
// so this is in fact not an indic sequence, so we can break the two.
559+
//
560+
// Scalar view #2:
561+
//
562+
// [.linkingConsonant, .virama, .extend, .linkingConsonant]
563+
// ^
564+
// | = Same as above
565+
// ^
566+
// | = This is a virama, so we at least have seen
567+
// 1 to be able to return true if we see a
568+
// linking consonant later.
569+
// ^
570+
// | = Is a linking consonant and we've seen a virama, so this is a
571+
// legitimate indic sequence, so do NOT break the initial question.
572+
internal func checkIfInIndicSequence(_ index: Int) -> Bool {
  var cursor = String.Index(_encodedOffset: index)

  // Nothing precedes the start of the string, so there is no sequence to be
  // part of.
  if cursor == startIndex {
    return false
  }

  let scalars = String.UnicodeScalarView(self)
  scalars.formIndex(before: &cursor)

  // The scalar immediately before `index` may itself be the Virama we need.
  var sawVirama = scalars[cursor]._isVirama

  // Keep stepping backwards one scalar at a time until something settles the
  // question or we run out of string.
  while cursor != startIndex {
    scalars.formIndex(before: &cursor)
    let current = scalars[cursor]

    // Reaching a linking consonant ends the walk: the sequence is legitimate
    // only if a Virama was encountered on the way here.
    if current._isLinkingConsonant {
      return sawVirama
    }

    switch Unicode._GraphemeBreakProperty(from: current) {
    case .zwj:
      // Zero width joiners are simply skipped over.
      break

    case .extend:
      // An extend whose CCC is 0 disqualifies the sequence.
      let normData = Unicode._NormData(current, fastUpperbound: 0x300)

      if normData.ccc == 0 {
        return false
      }

      if current._isVirama {
        sawVirama = true
      }

    default:
      // Any other scalar cannot be part of the sequence.
      return false
    }
  }

  // Hit the start of the string without ever finding a linking consonant.
  return false
}
627+
451628
// When walking backwards, it's impossible to know whether we break when we
452629
// see our first (.regionalIndicator, .regionalIndicator) without walking
453630
// further backwards. This walks the string backwards enough until we figure

stdlib/public/stubs/Unicode/Common/GraphemeData.h

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
#define GRAPHEME_BREAK_DATA_COUNT 621
2222

23-
static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
23+
static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
2424
0x3E00000, 0x400007F, 0x800000A9, 0xAD, 0x800000AE, 0x2DE00300, 0x20C00483, 0x25800591,
2525
0x200005BF, 0x202005C1, 0x202005C4, 0x200005C7, 0x40A00600, 0x21400610, 0x61C, 0x2280064B,
2626
0x20000670, 0x20C006D6, 0x400006DD, 0x20A006DF, 0x202006E7, 0x206006EA, 0x4000070F, 0x20000711,
@@ -101,4 +101,41 @@ static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
101101
0xB701F947, 0x3EE0000, 0x2BEE0020, 0xFEE0080, 0x3DEE0100,
102102
};
103103

104+
static const __swift_uint16_t _swift_stdlib_linkingConsonant_ranks[165] = {
105+
0x0, 0xE, 0xE, 0x12, 0x13, 0x0, 0x0, 0x0, 0x25, 0x35, 0x0, 0x20, 0x25, 0x46, 0x4B, 0x0, 0x17,
106+
0x23, 0x3D, 0x44, 0x0, 0xA, 0x24, 0x31, 0x4B, 0x0, 0x1, 0x27, 0x27, 0x49, 0x0, 0xF, 0x2E, 0x3A,
107+
0x57, 0x0, 0x0, 0x1F, 0x2C, 0x2C, 0x0, 0x21, 0x2D, 0x3C, 0x3C, 0x0, 0x0, 0x0, 0xD, 0x2C, 0x0,
108+
0x2D, 0x30, 0x30, 0x30, 0x0, 0x0, 0x0, 0x1E, 0x31, 0x0, 0x2C, 0x2C, 0x63, 0x72, 0x0, 0x0, 0x0,
109+
0x29, 0x2F, 0x0, 0x26, 0x4A, 0x51, 0x51, 0x0, 0x18, 0x39, 0x54, 0x68, 0x0, 0x18, 0x2F, 0x53, 0x64,
110+
0x0, 0x23, 0x39, 0x69, 0x72, 0x0, 0x0, 0x0, 0xE, 0x18, 0x0, 0x0, 0xF, 0x25, 0x25, 0x0, 0x25, 0x26,
111+
0x49, 0x49, 0x0, 0x19, 0x37, 0x59, 0x61, 0x0, 0xC, 0x24, 0x52, 0x5D, 0x0, 0x8, 0x8, 0x8, 0x2A,
112+
0x0, 0x0, 0x21, 0x21, 0x21, 0x0, 0x2, 0x21, 0x23, 0x43, 0x0, 0x16, 0x22, 0x3D, 0x44, 0x0, 0x0,
113+
0x0, 0x22, 0x22, 0x0, 0x0, 0x0, 0x22, 0x22, 0x0, 0x22, 0x28, 0x4B, 0x73, 0x0, 0x0, 0x21, 0x21,
114+
0x3F, 0x0, 0x0, 0x25, 0x39, 0x43, 0x0, 0x12, 0x12, 0x12, 0x12,
115+
};
116+
117+
static const __swift_uint64_t _swift_stdlib_linkingConsonant[166] = {
118+
0x5, 0x7E0FF00, 0x0, 0x3C0000000, 0x400000000000000, 0x5BFF, 0x0, 0x0, 0x3FFFFFFFFE00000,
119+
0xFF000000FF000000, 0x0, 0x3C5FDFFFFE0, 0x30000B000, 0x36DFDFFFFE0, 0x5E00, 0xFFE0, 0x3EDFDFF,
120+
0xFFE0000002000000, 0xB000000003EDFDFF, 0xD620000000020000, 0xC718, 0x3FF, 0xFDFFFFE000000000,
121+
0x700000003FF, 0xFDFFFFE000000000, 0x3EF, 0x40000000, 0x7FFFFFFFFE00000, 0x0, 0x2FFBFFFFFC000000,
122+
0x7F, 0xFFFE000000000000, 0x7FFFFFFF, 0xF7D6000000000000, 0x7FAFFFFF, 0xF000, 0x0,
123+
0xFFFFFEFF00000000, 0x1FFF, 0x0, 0x0, 0x1FFFFFFFF0000, 0xC0623C0300008000, 0x4003FFE1, 0x0, 0x0,
124+
0x0, 0x0, 0xFFF8000000000000, 0xFFF80003FFF88003, 0x3, 0xFFFFFFFF0001DFF8, 0x7, 0x0, 0x0, 0x0,
125+
0x0, 0x0, 0x7FFFFFFE0000, 0x7FFFF00000000, 0x0, 0xFFFFFFFFFFF, 0x0, 0xFFFFFFFF007FFFFF, 0x181FFF,
126+
0x0, 0x0, 0x0, 0x1FE0000FFFFFFFF8, 0xFC00000000000000, 0xFFFF, 0xFFFFFFFF3800C001,
127+
0xFFFFFFFF0000000F, 0xE0000000000F, 0x0, 0x0, 0xFFFFF78000000000, 0x3FFFFFFF00000007,
128+
0xFFFC00000005FE3C, 0xFFFFF, 0x0, 0x3FFFFFC000000, 0x7FFFFF, 0xFFFFFFFF8E000000,
129+
0xFF9F000000000007, 0x7C00, 0x1FFFFFFFFC0, 0xC40EFFFF00000000, 0xFFFFFFFFFFFF, 0x7FC00000000, 0x0,
130+
0x0, 0x0, 0x3FFF000000000000, 0x7FD, 0x0, 0x0, 0xFEEF000100000000, 0x3FFFFF, 0x0, 0x0,
131+
0xFFFFFFFFF80000, 0x20000000000000, 0xFFFFFFFFE000, 0x0, 0xFF80, 0x900000007FFFFF, 0x7FFFFFFE0,
132+
0x7FFFFFFFE, 0xFF00000000000000, 0xFFFB, 0xFFF, 0xBFFFBD7000000000, 0x7FFFFFFFFC0001FF,
133+
0xFFE0000000000000, 0xFDFF, 0x3ED, 0x0, 0x0, 0xFFFFFFFFC0000000, 0x1F, 0x0, 0xFFFFFFFF8000, 0x0,
134+
0x0, 0x0, 0xC000000000000000, 0x7FFFFFFF, 0xC000000000000000, 0xFFFFFFFF, 0x0, 0xFFFFFC0000000000,
135+
0x10007FF, 0x7FFFFFF00000000, 0x7F00000000, 0x0, 0x0, 0x0, 0xFFFFFFFFC000000, 0x0, 0x0, 0x0, 0x0,
136+
0xFFFFFF6FF000, 0x0, 0x0, 0xFFFFFFFFC0000000, 0xF800000000000001, 0x7FFFFFFFF, 0xFFFFFFFFFF000,
137+
0x0, 0x0, 0x7FFFFFFFC0000000, 0x0, 0xFFFFFFFC, 0x0, 0x0, 0x1FFFFFFFFF000, 0xFFFFF00000000000,
138+
0x3FF, 0x0, 0x3FFFF, 0x0, 0x0, 0x0, 0x0,
139+
};
140+
104141
#endif // #ifndef GRAPHEME_DATA_H

stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "Common/GraphemeData.h"
1414
#include "../SwiftShims/UnicodeData.h"
15+
#include <limits>
1516

1617
SWIFT_RUNTIME_STDLIB_INTERNAL
1718
__swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) {
@@ -57,3 +58,16 @@ __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar)
5758
// property). Return the max value here to indicate .any.
5859
return 0xFF;
5960
}
61+
62+
// Reports whether `scalar` is present in the linking consonant bit array.
//
// The lookup helper is given the `_swift_stdlib_linkingConsonant` table and
// its companion `_swift_stdlib_linkingConsonant_ranks` table; a result equal
// to the maximum `__swift_intptr_t` value is treated here as "not present".
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar) {
  const auto notFound = std::numeric_limits<__swift_intptr_t>::max();

  auto idx = _swift_stdlib_getScalarBitArrayIdx(
      scalar, _swift_stdlib_linkingConsonant,
      _swift_stdlib_linkingConsonant_ranks);

  return idx != notFound;
}

0 commit comments

Comments
 (0)