Skip to content

Commit 27e6241

Browse files
authored
Merge pull request #41389 from Azoy/fix-indic-sequences
[stdlib] Fix backwards count of Indic graphemes
2 parents fdc04fd + 657c17f commit 27e6241

File tree

6 files changed

+958
-2
lines changed

6 files changed

+958
-2
lines changed

stdlib/private/StdlibUnicodeUnittest/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ add_swift_target_library(swiftStdlibUnicodeUnittest ${SWIFT_STDLIB_LIBRARY_BUILD
66
StdlibUnicodeUnittest.swift
77
Collation.swift
88
UnicodeScalarProperties.swift
9+
GraphemeBreaking.swift
910

1011
SWIFT_MODULE_DEPENDS StdlibUnittest
1112
SWIFT_MODULE_DEPENDS_LINUX Glibc
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
// Normalization tests are currently only available on Darwin, awaiting a sensible
14+
// file API...
15+
#if _runtime(_ObjC)
16+
import Foundation
17+
18+
/// Parses grapheme break test data and appends one entry per test line.
///
/// Only lines that begin with "÷" are treated as tests; every other line is
/// ignored. Within a test line, everything from the first "#" onward is
/// dropped and the remainder is split on spaces into alternating break
/// operators and hexadecimal scalar values.
///
/// - Parameters:
///   - data: The text to parse, with one test per line.
///   - result: The array that receives a `(string, count)` pair per test line,
///     where `string` is built from the line's scalars and `count` is the
///     number of "÷" operators found after the leading one.
func parseGraphemeBreakTests(
  _ data: String,
  into result: inout [(String, Int)]
) {
  // Only look at actual tests.
  for line in data.split(separator: "\n") where line.hasPrefix("÷") {
    // Keep the part before the first "#" and break it into fields.
    let fields = line.split(separator: "#")[0].split(separator: " ")

    var text = ""
    var breakCount = 0

    // The leading field (offset 0) is never inspected.
    for (offset, field) in fields.enumerated().dropFirst() {
      if offset % 2 == 1 {
        // Odd offsets hold a scalar value written in hexadecimal. Malformed
        // input traps here, exactly as a failed unwrap would.
        let value = UInt32(field, radix: 16)!
        text.unicodeScalars.append(Unicode.Scalar(value)!)
      } else if field == "÷" {
        // Even offsets hold a grapheme breaking operator. "÷" is a break and
        // bumps the count; anything else (i.e. "×") is not a break.
        breakCount += 1
      }
    }

    result.append((text, breakCount))
  }
}
60+
61+
/// The test cases obtained by reading "GraphemeBreakTest.txt" with
/// `readInputFile` and feeding its contents to `parseGraphemeBreakTests`.
///
/// Each element pairs a test string with the count recorded for it by
/// `parseGraphemeBreakTests`.
public let graphemeBreakTests: [(String, Int)] = {
  var tests: [(String, Int)] = []
  parseGraphemeBreakTests(readInputFile("GraphemeBreakTest.txt"), into: &tests)
  return tests
}()
70+
#endif

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,8 @@ extension _StringGuts {
390390
// If we're currently in an indic sequence (or if our lhs is a linking
391391
// consonant), then this check and everything underneath ensures that
392392
// we continue being in one and may check if this extend is a Virama.
393-
if state.isInIndicSequence || scalar1._isLinkingConsonant {
393+
if state.isInIndicSequence ||
394+
(!isBackwards && scalar1._isLinkingConsonant) {
394395
if y == .extend {
395396
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
396397

@@ -440,7 +441,8 @@ extension _StringGuts {
440441
// GB999
441442
default:
442443
// GB9c
443-
if state.isInIndicSequence, state.hasSeenVirama, scalar2._isLinkingConsonant {
444+
if !isBackwards, state.isInIndicSequence, state.hasSeenVirama,
445+
scalar2._isLinkingConsonant {
444446
state.hasSeenVirama = false
445447
return false
446448
}

0 commit comments

Comments
 (0)