Skip to content

[stdlib] Implement the Indic grapheme breaking rules #40746

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions stdlib/public/SwiftShims/UnicodeData.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ __swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x,
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar);

// Returns whether `scalar` is a linking consonant. The Swift grapheme breaking
// code uses this to apply the Indic sequence rule (GB9c).
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar);

//===----------------------------------------------------------------------===//
// Unicode.Scalar.Properties
//===----------------------------------------------------------------------===//
Expand Down
189 changes: 183 additions & 6 deletions stdlib/public/core/StringGraphemeBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
//
//===----------------------------------------------------------------------===//

import SwiftShims

/// CR and LF are common special cases in grapheme breaking logic
private var _CR: UInt8 { return 0x0d }
private var _LF: UInt8 { return 0x0a }
Expand Down Expand Up @@ -175,13 +177,56 @@ extension _StringGuts {
}
}

extension Unicode.Scalar {
  /// Whether this scalar is a linking consonant, as reported by the runtime
  /// lookup `_swift_stdlib_isLinkingConsonant`.
  fileprivate var _isLinkingConsonant: Bool {
    return _swift_stdlib_isLinkingConsonant(value)
  }

  /// Whether this scalar is one of the virama code points recognized by the
  /// Indic grapheme breaking logic. Exactly six scripts are covered; every
  /// other scalar answers `false`.
  fileprivate var _isVirama: Bool {
    switch value {
    case 0x94D, // Devanagari
         0x9CD, // Bengali
         0xACD, // Gujarati
         0xB4D, // Oriya
         0xC4D, // Telugu
         0xD4D: // Malayalam
      return true

    default:
      return false
    }
  }
}

internal struct _GraphemeBreakingState {
// When we're looking through an indic sequence, one of the requirements is
// that there is at LEAST 1 Virama present between two linking consonants.
// This value helps ensure that when we ultimately need to decide whether or
// not to break that we've at least seen 1 when walking.
var hasSeenVirama = false

// When walking forwards in a string, we need to know whether or not we've
// entered an emoji sequence to be able to eventually break after all of the
// emoji's various extenders and zero width joiners. This bit allows us to
// keep track of whether or not we're still in an emoji sequence when deciding
// to break.
var isInEmojiSequence: Bool = false
var isInEmojiSequence = false

// Similar to emoji sequences, we need to know not to break an Indic grapheme
// sequence. This sequence is (potentially) composed of many scalars and isn't
// as trivial as comparing two grapheme properties.
var isInIndicSequence = false

// When walking forward in a string, we need to not break on emoji flag
// sequences. Emoji flag sequences are composed of 2 regional indicators, so
Expand All @@ -190,7 +235,7 @@ internal struct _GraphemeBreakingState {
// is another regional indicator, we reach the same decision rule, but in this
// case we actually need to break because there's a boundary between emoji flag
// sequences.
var shouldBreakRI: Bool = false
var shouldBreakRI = false
}

extension _StringGuts {
Expand Down Expand Up @@ -288,8 +333,12 @@ extension _StringGuts {
// continue treating the current grapheme cluster as an emoji sequence.
var enterEmojiSequence = false

// Very similar to emoji sequences, but for Indic grapheme sequences.
var enterIndicSequence = false

defer {
state.isInEmojiSequence = enterEmojiSequence
state.isInIndicSequence = enterIndicSequence
}

switch (x, y) {
Expand Down Expand Up @@ -338,6 +387,26 @@ extension _StringGuts {
enterEmojiSequence = true
}

// If we're currently in an indic sequence (or if our lhs is a linking
// consonant), then this check and everything underneath ensures that
// we continue being in one and may check if this extend is a Virama.
if state.isInIndicSequence || scalar1._isLinkingConsonant {
if y == .extend {
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)

// If our extend's CCC is 0, then this rule does not apply.
guard extendNormData.ccc != 0 else {
return false
}
}

enterIndicSequence = true

if scalar2._isVirama {
state.hasSeenVirama = true
}
}

return false

// GB9a
Expand Down Expand Up @@ -370,6 +439,32 @@ extension _StringGuts {

// GB999
default:
// GB9c
if state.isInIndicSequence, state.hasSeenVirama, scalar2._isLinkingConsonant {
state.hasSeenVirama = false
return false
}

// Handle GB9c when walking backwards.
if isBackwards {
switch (x, scalar2._isLinkingConsonant) {
case (.extend, true):
let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300)

guard extendNormData.ccc != 0 else {
return true
}

return !checkIfInIndicSequence(index)

case (.zwj, true):
return !checkIfInIndicSequence(index)

default:
return true
}
}

return true
}
}
Expand Down Expand Up @@ -417,9 +512,7 @@ extension _StringGuts {
// | = We found our starting .extendedPictographic letting us
// know that we are in an emoji sequence so our initial
// break question is answered as NO.
internal func checkIfInEmojiSequence(
_ index: Int
) -> Bool {
internal func checkIfInEmojiSequence(_ index: Int) -> Bool {
var emojiIdx = String.Index(_encodedOffset: index)

guard emojiIdx != startIndex else {
Expand Down Expand Up @@ -447,7 +540,91 @@ extension _StringGuts {

return false
}


// When walking backwards, it's impossible to know whether we break when we
// see our first ((.extend|.zwj), .linkingConsonant) without walking
// further backwards. This walks the string backwards enough until we figure
// out whether or not to break this indic sequence. For example:
//
// Scalar view #1:
//
// [.virama, .extend, .linkingConsonant]
// ^
// | = To be able to know whether or not to break these
// two, we need to walk backwards to determine if
// this is a legitimate indic sequence.
// ^
// | = The scalar sequence ends without a starting linking consonant,
// so this is in fact not an indic sequence, so we can break the two.
//
// Scalar view #2:
//
// [.linkingConsonant, .virama, .extend, .linkingConsonant]
// ^
// | = Same as above
// ^
// | = This is a virama, so we at least have seen
// 1 to be able to return true if we see a
// linking consonant later.
// ^
// | = Is a linking consonant and we've seen a virama, so this is a
// legitimate indic sequence, so do NOT break the initial question.
internal func checkIfInIndicSequence(_ index: Int) -> Bool {
  // Walks the scalars that precede `index` from right to left and reports
  // whether they form `linkingConsonant (extend | zwj)*` with at least one
  // virama among the extends. `true` means the caller must NOT break.
  var cursor = String.Index(_encodedOffset: index)

  // Nothing precedes the start of the string, so there is no sequence.
  if cursor == startIndex {
    return false
  }

  let scalarView = String.UnicodeScalarView(self)
  scalarView.formIndex(before: &cursor)

  // The scalar directly before `index` may itself be the virama.
  var sawVirama = scalarView[cursor]._isVirama

  while cursor != startIndex {
    scalarView.formIndex(before: &cursor)
    let current = scalarView[cursor]

    // Reaching a linking consonant ends the walk: the sequence is legitimate
    // only if a virama was seen on the way here.
    if current._isLinkingConsonant {
      return sawVirama
    }

    switch Unicode._GraphemeBreakProperty(from: current) {
    case .zwj:
      // Zero width joiners are allowed anywhere inside the sequence.
      continue

    case .extend:
      // An extend with a canonical combining class of 0 disqualifies the
      // sequence.
      let normData = Unicode._NormData(current, fastUpperbound: 0x300)

      if normData.ccc == 0 {
        return false
      }

      if current._isVirama {
        sawVirama = true
      }

    default:
      // Any other scalar cannot be part of an Indic sequence.
      return false
    }
  }

  // Ran out of scalars without ever finding the leading linking consonant.
  return false
}

// When walking backwards, it's impossible to know whether we break when we
// see our first (.regionalIndicator, .regionalIndicator) without walking
// further backwards. This walks the string backwards enough until we figure
Expand Down
39 changes: 38 additions & 1 deletion stdlib/public/stubs/Unicode/Common/GraphemeData.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

#define GRAPHEME_BREAK_DATA_COUNT 621

static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
0x3E00000, 0x400007F, 0x800000A9, 0xAD, 0x800000AE, 0x2DE00300, 0x20C00483, 0x25800591,
0x200005BF, 0x202005C1, 0x202005C4, 0x200005C7, 0x40A00600, 0x21400610, 0x61C, 0x2280064B,
0x20000670, 0x20C006D6, 0x400006DD, 0x20A006DF, 0x202006E7, 0x206006EA, 0x4000070F, 0x20000711,
Expand Down Expand Up @@ -101,4 +101,41 @@ static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
0xB701F947, 0x3EE0000, 0x2BEE0020, 0xFEE0080, 0x3DEE0100,
};

// Rank table handed to _swift_stdlib_getScalarBitArrayIdx, together with
// _swift_stdlib_linkingConsonant, by _swift_stdlib_isLinkingConsonant.
// NOTE(review): presumably cumulative set-bit counts for the bit array that
// follows — confirm against the lookup helper before editing by hand.
// NOTE(review): declared with 165 entries while the bit array declares 166 —
// confirm this difference is intended.
static const __swift_uint16_t _swift_stdlib_linkingConsonant_ranks[165] = {
0x0, 0xE, 0xE, 0x12, 0x13, 0x0, 0x0, 0x0, 0x25, 0x35, 0x0, 0x20, 0x25, 0x46, 0x4B, 0x0, 0x17,
0x23, 0x3D, 0x44, 0x0, 0xA, 0x24, 0x31, 0x4B, 0x0, 0x1, 0x27, 0x27, 0x49, 0x0, 0xF, 0x2E, 0x3A,
0x57, 0x0, 0x0, 0x1F, 0x2C, 0x2C, 0x0, 0x21, 0x2D, 0x3C, 0x3C, 0x0, 0x0, 0x0, 0xD, 0x2C, 0x0,
0x2D, 0x30, 0x30, 0x30, 0x0, 0x0, 0x0, 0x1E, 0x31, 0x0, 0x2C, 0x2C, 0x63, 0x72, 0x0, 0x0, 0x0,
0x29, 0x2F, 0x0, 0x26, 0x4A, 0x51, 0x51, 0x0, 0x18, 0x39, 0x54, 0x68, 0x0, 0x18, 0x2F, 0x53, 0x64,
0x0, 0x23, 0x39, 0x69, 0x72, 0x0, 0x0, 0x0, 0xE, 0x18, 0x0, 0x0, 0xF, 0x25, 0x25, 0x0, 0x25, 0x26,
0x49, 0x49, 0x0, 0x19, 0x37, 0x59, 0x61, 0x0, 0xC, 0x24, 0x52, 0x5D, 0x0, 0x8, 0x8, 0x8, 0x2A,
0x0, 0x0, 0x21, 0x21, 0x21, 0x0, 0x2, 0x21, 0x23, 0x43, 0x0, 0x16, 0x22, 0x3D, 0x44, 0x0, 0x0,
0x0, 0x22, 0x22, 0x0, 0x0, 0x0, 0x22, 0x22, 0x0, 0x22, 0x28, 0x4B, 0x73, 0x0, 0x0, 0x21, 0x21,
0x3F, 0x0, 0x0, 0x25, 0x39, 0x43, 0x0, 0x12, 0x12, 0x12, 0x12,
};

// Table of 64-bit words passed to _swift_stdlib_getScalarBitArrayIdx by
// _swift_stdlib_isLinkingConsonant to decide whether a scalar is a linking
// consonant.
// NOTE(review): presumably a packed bit array with one bit per scalar; the
// exact layout is defined by the lookup helper — confirm before editing.
static const __swift_uint64_t _swift_stdlib_linkingConsonant[166] = {
0x5, 0x7E0FF00, 0x0, 0x3C0000000, 0x400000000000000, 0x5BFF, 0x0, 0x0, 0x3FFFFFFFFE00000,
0xFF000000FF000000, 0x0, 0x3C5FDFFFFE0, 0x30000B000, 0x36DFDFFFFE0, 0x5E00, 0xFFE0, 0x3EDFDFF,
0xFFE0000002000000, 0xB000000003EDFDFF, 0xD620000000020000, 0xC718, 0x3FF, 0xFDFFFFE000000000,
0x700000003FF, 0xFDFFFFE000000000, 0x3EF, 0x40000000, 0x7FFFFFFFFE00000, 0x0, 0x2FFBFFFFFC000000,
0x7F, 0xFFFE000000000000, 0x7FFFFFFF, 0xF7D6000000000000, 0x7FAFFFFF, 0xF000, 0x0,
0xFFFFFEFF00000000, 0x1FFF, 0x0, 0x0, 0x1FFFFFFFF0000, 0xC0623C0300008000, 0x4003FFE1, 0x0, 0x0,
0x0, 0x0, 0xFFF8000000000000, 0xFFF80003FFF88003, 0x3, 0xFFFFFFFF0001DFF8, 0x7, 0x0, 0x0, 0x0,
0x0, 0x0, 0x7FFFFFFE0000, 0x7FFFF00000000, 0x0, 0xFFFFFFFFFFF, 0x0, 0xFFFFFFFF007FFFFF, 0x181FFF,
0x0, 0x0, 0x0, 0x1FE0000FFFFFFFF8, 0xFC00000000000000, 0xFFFF, 0xFFFFFFFF3800C001,
0xFFFFFFFF0000000F, 0xE0000000000F, 0x0, 0x0, 0xFFFFF78000000000, 0x3FFFFFFF00000007,
0xFFFC00000005FE3C, 0xFFFFF, 0x0, 0x3FFFFFC000000, 0x7FFFFF, 0xFFFFFFFF8E000000,
0xFF9F000000000007, 0x7C00, 0x1FFFFFFFFC0, 0xC40EFFFF00000000, 0xFFFFFFFFFFFF, 0x7FC00000000, 0x0,
0x0, 0x0, 0x3FFF000000000000, 0x7FD, 0x0, 0x0, 0xFEEF000100000000, 0x3FFFFF, 0x0, 0x0,
0xFFFFFFFFF80000, 0x20000000000000, 0xFFFFFFFFE000, 0x0, 0xFF80, 0x900000007FFFFF, 0x7FFFFFFE0,
0x7FFFFFFFE, 0xFF00000000000000, 0xFFFB, 0xFFF, 0xBFFFBD7000000000, 0x7FFFFFFFFC0001FF,
0xFFE0000000000000, 0xFDFF, 0x3ED, 0x0, 0x0, 0xFFFFFFFFC0000000, 0x1F, 0x0, 0xFFFFFFFF8000, 0x0,
0x0, 0x0, 0xC000000000000000, 0x7FFFFFFF, 0xC000000000000000, 0xFFFFFFFF, 0x0, 0xFFFFFC0000000000,
0x10007FF, 0x7FFFFFF00000000, 0x7F00000000, 0x0, 0x0, 0x0, 0xFFFFFFFFC000000, 0x0, 0x0, 0x0, 0x0,
0xFFFFFF6FF000, 0x0, 0x0, 0xFFFFFFFFC0000000, 0xF800000000000001, 0x7FFFFFFFF, 0xFFFFFFFFFF000,
0x0, 0x0, 0x7FFFFFFFC0000000, 0x0, 0xFFFFFFFC, 0x0, 0x0, 0x1FFFFFFFFF000, 0xFFFFF00000000000,
0x3FF, 0x0, 0x3FFFF, 0x0, 0x0, 0x0, 0x0,
};

#endif // #ifndef GRAPHEME_DATA_H
14 changes: 14 additions & 0 deletions stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "Common/GraphemeData.h"
#include "../SwiftShims/UnicodeData.h"
#include <limits>

SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) {
Expand Down Expand Up @@ -57,3 +58,16 @@ __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar)
// property). Return the max value here to indicate .any.
return 0xFF;
}

SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar) {
  // The bit-array lookup yields the maximum __swift_intptr_t value when the
  // scalar is not in the linking consonant set; any other index is a hit.
  const auto notFound = std::numeric_limits<__swift_intptr_t>::max();

  const auto position = _swift_stdlib_getScalarBitArrayIdx(
      scalar, _swift_stdlib_linkingConsonant,
      _swift_stdlib_linkingConsonant_ranks);

  return position != notFound;
}
Loading