Skip to content

Implement Script and Extension matching #176

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 335 additions & 0 deletions Sources/_CUnicode/Common/ScriptData.h

Large diffs are not rendered by default.

87 changes: 87 additions & 0 deletions Sources/_CUnicode/UnicodeScalarProps.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#include "Common/ScriptData.h"
#include "include/UnicodeData.h"

// Looks up the Script property of `scalar` by binary searching the packed
// range table `_swift_stdlib_scripts`.
//
// Each table entry packs two fields into 32 bits:
//   - low 21 bits:  the first scalar of a range
//   - high 11 bits: the script enum value for that range
// A range extends up to (but not including) the first scalar of the following
// entry; the final entry extends through 0x10FFFF.
//
// Returns the script enum value, or UINT8_MAX if no range contains `scalar`.
SWIFT_CC
uint8_t _swift_stdlib_getScript(uint32_t scalar) {
  // Mask selecting the 21-bit scalar field of an entry.
  const uint32_t scalarMask = 0x1FFFFF;

  const int endIndex = SCRIPTS_COUNT;
  const int lastIndex = endIndex - 1;

  int low = 0;
  int high = lastIndex;

  while (low <= high) {
    const int mid = low + (high - low) / 2;
    const uint32_t entry = _swift_stdlib_scripts[mid];

    // First scalar covered by this entry.
    const uint32_t rangeStart = entry & scalarMask;

    // Last scalar covered by this entry: one before the next entry's start,
    // or the maximum scalar value when this is the final entry.
    uint32_t rangeEnd = 0x10FFFF;
    if (mid != lastIndex) {
      rangeEnd = (_swift_stdlib_scripts[mid + 1] & scalarMask) - 1;
    }

    if (scalar >= rangeStart && scalar <= rangeEnd) {
      // Drop the scalar field; what remains is the script enum value.
      return (uint8_t)(entry >> 21);
    }

    if (scalar > rangeEnd) {
      // The scalar lies in a later range.
      low = mid + 1;
    } else {
      // Not in range and not above it, so it lies in an earlier range.
      high = mid - 1;
    }
  }

  // The table is expected to cover every scalar from 0x0 to 0x10FFFF, so this
  // should be unreachable. Report failure with 255 if it is reached anyway.
  return UINT8_MAX;
}

// Looks up the explicit Script_Extensions data for `scalar`.
//
// Parameters:
//   scalar - the Unicode scalar value to look up.
//   count  - out-parameter (must be non-null) that receives the number of
//            script bytes available at the returned pointer. It is set to 0
//            when the scalar has no explicit script extensions.
//
// Returns a pointer into `_swift_stdlib_script_extensions_data` holding
// `*count` script enum values, or null (0) when the scalar has no entry in
// the script-extensions bit array.
SWIFT_CC
const uint8_t * const _swift_stdlib_getScriptExtensions(uint32_t scalar,
                                                        uint8_t *count) {
  intptr_t dataIdx = _swift_stdlib_getScalarBitArrayIdx(scalar,
                                              _swift_stdlib_script_extensions,
                                        _swift_stdlib_script_extensions_ranks);

  // If we don't have an index into the data indices, then this scalar has no
  // script extensions. Always define the out-parameter so a caller that did
  // not pre-initialize it never observes an indeterminate count.
  if (dataIdx == INTPTR_MAX) {
    *count = 0;
    return 0;
  }

  // Each 16-bit index entry packs the element count in its high 5 bits and
  // the offset into the data array in its low 11 bits.
  uint16_t scalarDataIdx = _swift_stdlib_script_extensions_data_indices[dataIdx];
  *count = scalarDataIdx >> 11;

  return _swift_stdlib_script_extensions_data + (scalarDataIdx & 0x7FF);
}
9 changes: 9 additions & 0 deletions Sources/_CUnicode/include/UnicodeData.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,14 @@ uint8_t _swift_stdlib_getGraphemeBreakProperty(uint32_t scalar);
SWIFT_CC
_Bool _swift_stdlib_isLinkingConsonant(uint32_t scalar);

//===----------------------------------------------------------------------===//
// Scalar Props
//===----------------------------------------------------------------------===//

// Returns the script enum value for `scalar`. The implementation in
// UnicodeScalarProps.c returns UINT8_MAX (255) if no table range contains the
// scalar.
SWIFT_CC
uint8_t _swift_stdlib_getScript(uint32_t scalar);

// Returns a pointer to the script-extension bytes for `scalar` and stores the
// number of bytes in `*count`. Returns null when the scalar has no explicit
// script extensions; callers should initialize `*count` before the call and
// not rely on its value when the result is null.
SWIFT_CC
const uint8_t * const _swift_stdlib_getScriptExtensions(uint32_t scalar, uint8_t *count);

#endif // SWIFT_STDLIB_SHIMS_UNICODEDATA_H
19 changes: 17 additions & 2 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,21 @@ extension AST.CustomCharacterClass {
}

// NOTE: Conveniences, though not most performant
/// Builds a scalar consumer that accepts a scalar when its script, as computed
/// by `Unicode.Script.init(_:)`, equals `s`.
///
/// - Parameter s: The script to compare each scalar's script against.
/// - Returns: The consume function produced by `consumeScalar` for that
///   predicate.
private func consumeScalarScript(
  _ s: Unicode.Script
) -> MEProgram<String>.ConsumeFunction {
  consumeScalar { scalar in
    let scalarScript = Unicode.Script(scalar)
    return scalarScript == s
  }
}
/// Builds a scalar consumer that accepts a scalar when `s` appears in the
/// list returned by `Unicode.Script.extensions(for:)` for that scalar.
///
/// - Parameter s: The script to search for among each scalar's extensions.
/// - Returns: The consume function produced by `consumeScalar` for that
///   predicate.
private func consumeScalarScriptExtension(
  _ s: Unicode.Script
) -> MEProgram<String>.ConsumeFunction {
  consumeScalar { scalar in
    Unicode.Script.extensions(for: scalar).contains(s)
  }
}
private func consumeScalarGC(
_ gc: Unicode.GeneralCategory
) -> MEProgram<String>.ConsumeFunction {
Expand Down Expand Up @@ -469,10 +484,10 @@ extension AST.Atom.CharacterProperty {
return value ? cons : invert(cons)

case .script(let s):
throw Unsupported("TODO: Map script: \(s)")
return consumeScalarScript(s)

case .scriptExtension(let s):
throw Unsupported("TODO: Map script: \(s)")
return consumeScalarScriptExtension(s)

case .posix(let p):
return p.generateConsumer(opts)
Expand Down
36 changes: 35 additions & 1 deletion Sources/_StringProcessing/Unicode/ScalarProps.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,40 @@
//
//===----------------------------------------------------------------------===//

/// Swift-side declaration bound by symbol name to `_swift_stdlib_getScript`.
///
/// Takes a scalar value and returns a script raw value; the caller in this
/// file treats `UInt8.max` as "unknown script".
@_silgen_name("_swift_stdlib_getScript")
func _swift_stdlib_getScript(_: UInt32) -> UInt8

// TODO
/// Swift-side declaration bound by symbol name to
/// `_swift_stdlib_getScriptExtensions`.
///
/// Takes a scalar value and a pointer that receives the number of script
/// bytes, and returns a pointer to those bytes, or `nil` when the scalar has
/// no explicit script extensions. Callers should initialize the count before
/// the call and not rely on its value when the result is `nil`.
@_silgen_name("_swift_stdlib_getScriptExtensions")
func _swift_stdlib_getScriptExtensions(
_: UInt32,
_: UnsafeMutablePointer<UInt8>
) -> UnsafePointer<UInt8>?

extension Unicode.Script {
  /// Creates the script for `scalar` from the raw value returned by
  /// `_swift_stdlib_getScript`.
  ///
  /// The raw byte is reinterpreted directly as a `Unicode.Script`.
  /// NOTE(review): this assumes the table's raw values match the enum's
  /// in-memory representation — confirm against the table generator.
  init(_ scalar: Unicode.Scalar) {
    let rawValue = _swift_stdlib_getScript(scalar.value)

    // `UInt8.max` is the lookup's "not found" sentinel.
    _internalInvariant(rawValue != .max, "Unknown script rawValue: \(rawValue)")

    self = unsafeBitCast(rawValue, to: Self.self)
  }

  /// Returns the script extensions for `scalar`.
  ///
  /// When the lookup reports no explicit extension data (a `nil` pointer),
  /// the result is a single-element array holding the scalar's own script.
  ///
  /// - Parameter scalar: The scalar to look up.
  /// - Returns: The scripts read from the extension data, in stored order, or
  ///   `[Unicode.Script(scalar)]` when there is none.
  static func extensions(for scalar: Unicode.Scalar) -> [Unicode.Script] {
    var count: UInt8 = 0

    guard
      let base = _swift_stdlib_getScriptExtensions(scalar.value, &count)
    else {
      return [Unicode.Script(scalar)]
    }

    // Reinterpret each stored byte as a script value.
    return (0 ..< Int(count)).map { offset in
      unsafeBitCast(base[offset], to: Unicode.Script.self)
    }
  }
}
44 changes: 12 additions & 32 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -573,10 +573,7 @@ extension RegexTests {
firstMatchTest("[[:isALNUM:]]", input: "[[:alnum:]]", match: "a")
firstMatchTest("[[:AL_NUM:]]", input: "[[:alnum:]]", match: "a")

// Unfortunately, scripts are not part of stdlib...
firstMatchTest(
"[[:script=Greek:]]", input: "123αβγxyz", match: "α",
xfail: true)
firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α")

// MARK: Operators

Expand Down Expand Up @@ -677,34 +674,17 @@ extension RegexTests {
firstMatchTest(#"\p{ascii}"#, input: "123abcXYZ", match: "1")
firstMatchTest(#"\p{isAny}"#, input: "123abcXYZ", match: "1")

// Unfortunately, scripts are not part of stdlib...
firstMatchTest(
#"\p{sc=grek}"#, input: "123αβγxyz", match: "α",
xfail: true)
firstMatchTest(
#"\p{sc=isGreek}"#, input: "123αβγxyz", match: "α",
xfail: true)
firstMatchTest(
#"\p{Greek}"#, input: "123αβγxyz", match: "α",
xfail: true)
firstMatchTest(
#"\p{isGreek}"#, input: "123αβγxyz", match: "α",
xfail: true)
firstMatchTest(
#"\P{Script=Latn}"#, input: "abcαβγxyz", match: "α",
xfail: true)
firstMatchTest(
#"\p{script=Greek}"#, input: "123αβγxyz", match: "α",
xfail: true)
firstMatchTest(
#"\p{ISscript=isGreek}"#, input: "123αβγxyz", match: "α",
xfail: true)
firstMatchTest(
#"\p{scx=bamum}"#, input: "123ꚠꚡꚢxyz", match: "ꚠ",
xfail: true)
firstMatchTest(
#"\p{ISBAMUM}"#, input: "123ꚠꚡꚢxyz", match: "ꚠ",
xfail: true)
firstMatchTest(#"\p{sc=grek}"#, input: "123αβγxyz", match: "α")
firstMatchTest(#"\p{sc=isGreek}"#, input: "123αβγxyz", match: "α")
firstMatchTest(#"\p{Greek}"#, input: "123αβγxyz", match: "α")
firstMatchTest(#"\p{isGreek}"#, input: "123αβγxyz", match: "α")
firstMatchTest(#"\P{Script=Latn}"#, input: "abcαβγxyz", match: "α")
firstMatchTest(#"\p{script=Greek}"#, input: "123αβγxyz", match: "α")
firstMatchTest(#"\p{ISscript=isGreek}"#, input: "123αβγxyz", match: "α")
firstMatchTest(#"\p{scx=bamum}"#, input: "123ꚠꚡꚢxyz", match: "ꚠ")
firstMatchTest(#"\p{ISBAMUM}"#, input: "123ꚠꚡꚢxyz", match: "ꚠ")
firstMatchTest(#"\p{Script=Unknown}"#, input: "\u{10FFFF}", match: "\u{10FFFF}")
firstMatchTest(#"\p{scx=Gujr}"#, input: "\u{a839}", match: "\u{a839}")

firstMatchTest(#"\p{alpha}"#, input: "123abcXYZ", match: "a")
firstMatchTest(#"\P{alpha}"#, input: "123abcXYZ", match: "1")
Expand Down