Skip to content

Commit 6478466

Browse files
authored
Merge pull request swiftlang#176 from Azoy/scripts-are-here
Implement Script and Extension matching
2 parents 5e2b77c + 0433663 commit 6478466

File tree

6 files changed

+495
-35
lines changed

6 files changed

+495
-35
lines changed

Sources/_CUnicode/Common/ScriptData.h

Lines changed: 335 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "Common/ScriptData.h"
14+
#include "include/UnicodeData.h"
15+
16+
SWIFT_CC
17+
uint8_t _swift_stdlib_getScript(uint32_t scalar) {
18+
int lowerBoundIndex = 0;
19+
int endIndex = SCRIPTS_COUNT;
20+
int upperBoundIndex = endIndex - 1;
21+
22+
while (upperBoundIndex >= lowerBoundIndex) {
23+
int index = lowerBoundIndex + (upperBoundIndex - lowerBoundIndex) / 2;
24+
25+
const uint32_t entry = _swift_stdlib_scripts[index];
26+
27+
// Shift the enum value out of the scalar.
28+
uint32_t lowerBoundScalar = (entry << 11) >> 11;
29+
30+
uint32_t upperBoundScalar = 0;
31+
32+
// If we're not at the end of the array, the range count is simply the
33+
// distance to the next element.
34+
if (index != endIndex - 1) {
35+
const uint32_t nextEntry = _swift_stdlib_scripts[index + 1];
36+
37+
uint32_t nextLower = (nextEntry << 11) >> 11;
38+
39+
upperBoundScalar = nextLower - 1;
40+
} else {
41+
// Otherwise, the range count is the distance to 0x10FFFF
42+
upperBoundScalar = 0x10FFFF;
43+
}
44+
45+
// Shift the scalar out and get the enum value.
46+
uint8_t script = entry >> 21;
47+
48+
if (scalar >= lowerBoundScalar && scalar <= upperBoundScalar) {
49+
return script;
50+
}
51+
52+
if (scalar > upperBoundScalar) {
53+
lowerBoundIndex = index + 1;
54+
continue;
55+
}
56+
57+
if (scalar < lowerBoundScalar) {
58+
upperBoundIndex = index - 1;
59+
continue;
60+
}
61+
}
62+
63+
// If we make it out of this loop, then it means the scalar was not found at
64+
// all in the array. This should never happen because the array represents all
65+
// scalars from 0x0 to 0x10FFFF, but if somehow this branch gets reached,
66+
// return 255 to indicate a failure.
67+
return UINT8_MAX;
68+
}
69+
70+
SWIFT_CC
71+
const uint8_t * const _swift_stdlib_getScriptExtensions(uint32_t scalar,
72+
uint8_t *count) {
73+
intptr_t dataIdx = _swift_stdlib_getScalarBitArrayIdx(scalar,
74+
_swift_stdlib_script_extensions,
75+
_swift_stdlib_script_extensions_ranks);
76+
77+
// If we don't have an index into the data indices, then this scalar has no
78+
// script extensions
79+
if (dataIdx == INTPTR_MAX) {
80+
return 0;
81+
}
82+
83+
uint16_t scalarDataIdx = _swift_stdlib_script_extensions_data_indices[dataIdx];
84+
*count = scalarDataIdx >> 11;
85+
86+
return _swift_stdlib_script_extensions_data + (scalarDataIdx & 0x7FF);
87+
}

Sources/_CUnicode/include/UnicodeData.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,5 +58,14 @@ uint8_t _swift_stdlib_getGraphemeBreakProperty(uint32_t scalar);
5858
SWIFT_CC
5959
_Bool _swift_stdlib_isLinkingConsonant(uint32_t scalar);
6060

61+
//===----------------------------------------------------------------------===//
62+
// Scalar Props
63+
//===----------------------------------------------------------------------===//
64+
65+
// Returns the script enum value for `scalar`. The definition added in this
// commit returns UINT8_MAX when no table range contains the scalar.
SWIFT_CC
66+
uint8_t _swift_stdlib_getScript(uint32_t scalar);
67+
68+
// Returns a pointer to the script-extension entries for `scalar` and stores
// the number of entries in `*count`. The definition added in this commit
// returns a null pointer when the scalar has no script extensions.
SWIFT_CC
69+
const uint8_t * const _swift_stdlib_getScriptExtensions(uint32_t scalar, uint8_t *count);
6170

6271
#endif // SWIFT_STDLIB_SHIMS_UNICODEDATA_H

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,21 @@ extension AST.CustomCharacterClass {
430430
}
431431

432432
// NOTE: Conveniences, though not most performant
433+
/// Builds a consume function from a scalar predicate that is true when the
/// scalar's script is exactly `s`.
///
/// - Parameter s: The script to compare each scalar's script against.
/// - Returns: The consume function produced by `consumeScalar` for that
///   predicate.
private func consumeScalarScript(
  _ s: Unicode.Script
) -> MEProgram<String>.ConsumeFunction {
  return consumeScalar { scalar in
    s == Unicode.Script(scalar)
  }
}
440+
/// Builds a consume function from a scalar predicate that is true when `s`
/// appears among the scalar's script extensions.
///
/// - Parameter s: The script to look for in each scalar's extensions.
/// - Returns: The consume function produced by `consumeScalar` for that
///   predicate.
private func consumeScalarScriptExtension(
  _ s: Unicode.Script
) -> MEProgram<String>.ConsumeFunction {
  return consumeScalar { scalar in
    Unicode.Script.extensions(for: scalar).contains(s)
  }
}
433448
private func consumeScalarGC(
434449
_ gc: Unicode.GeneralCategory
435450
) -> MEProgram<String>.ConsumeFunction {
@@ -504,10 +519,10 @@ extension AST.Atom.CharacterProperty {
504519
return value ? cons : invert(cons)
505520

506521
case .script(let s):
507-
throw Unsupported("TODO: Map script: \(s)")
522+
return consumeScalarScript(s)
508523

509524
case .scriptExtension(let s):
510-
throw Unsupported("TODO: Map script: \(s)")
525+
return consumeScalarScriptExtension(s)
511526

512527
case .posix(let p):
513528
return p.generateConsumer(opts)

Sources/_StringProcessing/Unicode/ScalarProps.swift

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,40 @@
99
//
1010
//===----------------------------------------------------------------------===//
1111

12+
// Swift-side declaration bound to the `_swift_stdlib_getScript` symbol.
// Takes a scalar's value and returns its script as a raw `UInt8`; the C
// definition added in this commit returns UINT8_MAX (255) on a failed lookup.
@_silgen_name("_swift_stdlib_getScript")
13+
func _swift_stdlib_getScript(_: UInt32) -> UInt8
1214

13-
// TODO
15+
// Swift-side declaration bound to the `_swift_stdlib_getScriptExtensions`
// symbol. The first argument is the scalar's value; the second receives the
// number of extension entries. The result points at the first entry, or is
// nil when the C definition returns a null pointer (scalar has no script
// extensions).
@_silgen_name("_swift_stdlib_getScriptExtensions")
16+
func _swift_stdlib_getScriptExtensions(
17+
_: UInt32,
18+
_: UnsafeMutablePointer<UInt8>
19+
) -> UnsafePointer<UInt8>?
1420

21+
extension Unicode.Script {
  /// Creates the script assigned to `scalar`.
  ///
  /// The raw value comes from `_swift_stdlib_getScript` and is reinterpreted
  /// directly as a `Unicode.Script`.
  ///
  /// - Parameter scalar: The scalar whose script is looked up.
  init(_ scalar: Unicode.Scalar) {
    let rawValue = _swift_stdlib_getScript(scalar.value)

    // `UInt8.max` is the lookup's failure value and must never be bit-cast.
    _internalInvariant(rawValue != .max, "Unknown script rawValue: \(rawValue)")

    self = unsafeBitCast(rawValue, to: Self.self)
  }

  /// Returns the script extensions of `scalar`.
  ///
  /// When the lookup yields no extension data, the result is a one-element
  /// array holding the scalar's own script.
  ///
  /// - Parameter scalar: The scalar whose script extensions are looked up.
  /// - Returns: The scripts read from the extension data, in stored order.
  static func extensions(for scalar: Unicode.Scalar) -> [Unicode.Script] {
    var extensionCount: UInt8 = 0

    guard let base = _swift_stdlib_getScriptExtensions(
      scalar.value,
      &extensionCount
    ) else {
      return [Unicode.Script(scalar)]
    }

    // Every stored byte is the raw value of one script.
    return (0 ..< Int(extensionCount)).map { offset in
      unsafeBitCast(base[offset], to: Unicode.Script.self)
    }
  }
}

Tests/RegexTests/MatchTests.swift

Lines changed: 12 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -579,10 +579,7 @@ extension RegexTests {
579579
firstMatchTest("[[:isALNUM:]]", input: "[[:alnum:]]", match: "a")
580580
firstMatchTest("[[:AL_NUM:]]", input: "[[:alnum:]]", match: "a")
581581

582-
// Unfortunately, scripts are not part of stdlib...
583-
firstMatchTest(
584-
"[[:script=Greek:]]", input: "123αβγxyz", match: "α",
585-
xfail: true)
582+
firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α")
586583

587584
// MARK: Operators
588585

@@ -683,34 +680,17 @@ extension RegexTests {
683680
firstMatchTest(#"\p{ascii}"#, input: "123abcXYZ", match: "1")
684681
firstMatchTest(#"\p{isAny}"#, input: "123abcXYZ", match: "1")
685682

686-
// Unfortunately, scripts are not part of stdlib...
687-
firstMatchTest(
688-
#"\p{sc=grek}"#, input: "123αβγxyz", match: "α",
689-
xfail: true)
690-
firstMatchTest(
691-
#"\p{sc=isGreek}"#, input: "123αβγxyz", match: "α",
692-
xfail: true)
693-
firstMatchTest(
694-
#"\p{Greek}"#, input: "123αβγxyz", match: "α",
695-
xfail: true)
696-
firstMatchTest(
697-
#"\p{isGreek}"#, input: "123αβγxyz", match: "α",
698-
xfail: true)
699-
firstMatchTest(
700-
#"\P{Script=Latn}"#, input: "abcαβγxyz", match: "α",
701-
xfail: true)
702-
firstMatchTest(
703-
#"\p{script=Greek}"#, input: "123αβγxyz", match: "α",
704-
xfail: true)
705-
firstMatchTest(
706-
#"\p{ISscript=isGreek}"#, input: "123αβγxyz", match: "α",
707-
xfail: true)
708-
firstMatchTest(
709-
#"\p{scx=bamum}"#, input: "123ꚠꚡꚢxyz", match: "",
710-
xfail: true)
711-
firstMatchTest(
712-
#"\p{ISBAMUM}"#, input: "123ꚠꚡꚢxyz", match: "",
713-
xfail: true)
683+
firstMatchTest(#"\p{sc=grek}"#, input: "123αβγxyz", match: "α")
684+
firstMatchTest(#"\p{sc=isGreek}"#, input: "123αβγxyz", match: "α")
685+
firstMatchTest(#"\p{Greek}"#, input: "123αβγxyz", match: "α")
686+
firstMatchTest(#"\p{isGreek}"#, input: "123αβγxyz", match: "α")
687+
firstMatchTest(#"\P{Script=Latn}"#, input: "abcαβγxyz", match: "α")
688+
firstMatchTest(#"\p{script=Greek}"#, input: "123αβγxyz", match: "α")
689+
firstMatchTest(#"\p{ISscript=isGreek}"#, input: "123αβγxyz", match: "α")
690+
firstMatchTest(#"\p{scx=bamum}"#, input: "123ꚠꚡꚢxyz", match: "")
691+
firstMatchTest(#"\p{ISBAMUM}"#, input: "123ꚠꚡꚢxyz", match: "")
692+
firstMatchTest(#"\p{Script=Unknown}"#, input: "\u{10FFFF}", match: "\u{10FFFF}")
693+
firstMatchTest(#"\p{scx=Gujr}"#, input: "\u{a839}", match: "\u{a839}")
714694

715695
firstMatchTest(#"\p{alpha}"#, input: "123abcXYZ", match: "a")
716696
firstMatchTest(#"\P{alpha}"#, input: "123abcXYZ", match: "1")

0 commit comments

Comments
 (0)