Skip to content

Commit 6636815

Browse files
authored
Merge pull request #20315 from milseman/utf8string
[String] Use a UTF-8 representation for native strings
2 parents 5c03a0a + fee2787 commit 6636815

File tree

105 files changed

+50414
-49864
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

105 files changed

+50414
-49864
lines changed

benchmark/single-source/StringComparison.swift

Lines changed: 124 additions & 101 deletions
Large diffs are not rendered by default.

benchmark/single-source/StringComparison.swift.gyb

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,32 +30,58 @@ extension String {
3030
}
3131
}
3232

33-
% Names = ["ascii", "latin1", "fastPrenormal", "slowerPrenormal", "nonBMPSlowestPrenormal", "emoji", "abnormal", "zalgo", "longSharedPrefix"]
33+
% AllWorkloads = ["ascii", "latin1", "fastPrenormal", "slowerPrenormal", "nonBMPSlowestPrenormal", "emoji", "abnormal", "zalgo", "longSharedPrefix"]
34+
% ComparisonWorkloads = AllWorkloads
35+
% HashingWorkloads = ["ascii", "latin1", "fastPrenormal", "slowerPrenormal", "nonBMPSlowestPrenormal", "emoji", "abnormal", "zalgo", "longSharedPrefix"]
3436

35-
public let StringComparison = [
36-
% for Name in Names:
37+
// TODO(UTF8 post-merge): Disable longSharedPrefix hashing benchmark, which is
38+
// enabled here for 1-to-1 comparison vs master
39+
40+
// TODO(UTF8 post-merge): Enable NormalizedIteratorWorkloads for ["ascii",
41+
// "latin1", "fastPrenormal", "slowerPrenormal", "nonBMPSlowestPrenormal",
42+
// "emoji", "abnormal", "zalgo"]
43+
44+
% NormalizedIteratorWorkloads = []
45+
46+
public let StringComparison: [BenchmarkInfo] = [
47+
% for Name in ComparisonWorkloads:
3748
BenchmarkInfo(
3849
name: "StringComparison_${Name}",
3950
runFunction: run_StringComparison_${Name},
4051
tags: [.validation, .api, .String],
41-
setUpFunction: { blackHole(Workload_${Name}) }),
42-
% end # Names
52+
setUpFunction: { blackHole(Workload_${Name}) }
53+
),
54+
% end # ComparisonWorkloads
4355
]
4456

45-
public let StringHashing = [
46-
% for Name in Names:
57+
public let StringHashing: [BenchmarkInfo] = [
58+
% for Name in HashingWorkloads:
4759
BenchmarkInfo(
4860
name: "StringHashing_${Name}",
4961
runFunction: run_StringHashing_${Name},
5062
tags: [.validation, .api, .String],
51-
setUpFunction: { blackHole(Workload_${Name}) }),
52-
% end # Names
63+
setUpFunction: { blackHole(Workload_${Name}) }
64+
),
65+
% end # HashingWorkloads
5366
]
5467

55-
% for Name in Names:
68+
public let NormalizedIterator: [BenchmarkInfo] = [
69+
% for Name in NormalizedIteratorWorkloads:
70+
BenchmarkInfo(
71+
name: "NormalizedIterator_${Name}",
72+
runFunction: run_NormalizedIterator_${Name},
73+
tags: [.validation, .String],
74+
setUpFunction: { blackHole(Workload_${Name}) }
75+
),
76+
% end # NormalizedIteratorWorkloads
77+
]
5678

79+
% for Name in AllWorkloads:
5780
var Workload_${Name}: Workload! = Workload.${Name}
5881

82+
% end # AllWorkloads
83+
84+
%for Name in ComparisonWorkloads:
5985
@inline(never)
6086
public func run_StringComparison_${Name}(_ N: Int) {
6187
let workload: Workload = Workload_${Name}
@@ -70,6 +96,9 @@ public func run_StringComparison_${Name}(_ N: Int) {
7096
}
7197
}
7298

99+
% end # ComparisonWorkloads
100+
101+
%for Name in HashingWorkloads:
73102
@inline(never)
74103
public func run_StringHashing_${Name}(_ N: Int) {
75104
let workload: Workload = Workload.${Name}
@@ -81,8 +110,25 @@ public func run_StringHashing_${Name}(_ N: Int) {
81110
}
82111
}
83112
}
84-
85-
% end # Names
113+
114+
% end # HashingWorkloads
115+
116+
%for Name in NormalizedIteratorWorkloads:
117+
@inline(never)
118+
public func run_NormalizedIterator_${Name}(_ N: Int) {
119+
let workload: Workload = Workload.${Name}
120+
let tripCount = workload.tripCount
121+
let payload = workload.payload
122+
for _ in 1...tripCount*N {
123+
for str in payload {
124+
str._withNFCCodeUnits { cu in
125+
blackHole(cu)
126+
}
127+
}
128+
}
129+
}
130+
131+
% end # NormalizedIteratorWorkloads
86132

87133
struct Workload {
88134
static let N = 100

benchmark/utils/main.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ registerBenchmark(NSErrorTest)
266266
registerBenchmark(NSStringConversion)
267267
registerBenchmark(NibbleSort)
268268
registerBenchmark(NopDeinit)
269+
registerBenchmark(NormalizedIterator)
269270
registerBenchmark(ObjectAllocation)
270271
#if os(macOS) || os(iOS) || os(watchOS) || os(tvOS)
271272
registerBenchmark(ObjectiveCBridging)

include/swift/AST/DiagnosticsSema.def

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2825,8 +2825,6 @@ ERROR(builtin_unicode_scalar_literal_broken_proto,none,
28252825
ERROR(unicode_scalar_literal_broken_proto,none,
28262826
"protocol 'ExpressibleByUnicodeScalarLiteral' is broken", ())
28272827

2828-
ERROR(builtin_utf16_extended_grapheme_cluster_literal_broken_proto,none,
2829-
"protocol '_ExpressibleByBuiltinUTF16ExtendedGraphemeClusterLiteral' is broken", ())
28302828
ERROR(builtin_extended_grapheme_cluster_literal_broken_proto,none,
28312829
"protocol '_ExpressibleByBuiltinExtendedGraphemeClusterLiteral' is broken", ())
28322830
ERROR(extended_grapheme_cluster_literal_broken_proto,none,

include/swift/AST/KnownIdentifiers.def

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,6 @@ IDENTIFIER_(builtinUnicodeScalarLiteral)
161161
IDENTIFIER(unicodeScalarLiteral)
162162

163163
IDENTIFIER(stringLiteral)
164-
IDENTIFIER_(builtinUTF16StringLiteral)
165164
IDENTIFIER_(builtinStringLiteral)
166165
IDENTIFIER(StringLiteralType)
167166
IDENTIFIER(stringInterpolation)

include/swift/AST/KnownProtocols.def

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,10 @@ EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByImageLiteral)
8888
EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByFileReferenceLiteral)
8989

9090
BUILTIN_EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByBuiltinBooleanLiteral)
91-
BUILTIN_EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByBuiltinUTF16ExtendedGraphemeClusterLiteral)
9291
BUILTIN_EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByBuiltinExtendedGraphemeClusterLiteral)
9392
BUILTIN_EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByBuiltinFloatLiteral)
9493
BUILTIN_EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByBuiltinIntegerLiteral)
9594
BUILTIN_EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByBuiltinStringLiteral)
96-
BUILTIN_EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByBuiltinUTF16StringLiteral)
9795
BUILTIN_EXPRESSIBLE_BY_LITERAL_PROTOCOL_(ExpressibleByBuiltinUnicodeScalarLiteral)
9896

9997
#undef EXPRESSIBLE_BY_LITERAL_PROTOCOL

lib/IRGen/GenMeta.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4040,12 +4040,10 @@ SpecialProtocol irgen::getSpecialProtocolID(ProtocolDecl *P) {
40404040
case KnownProtocolKind::ExpressibleByImageLiteral:
40414041
case KnownProtocolKind::ExpressibleByFileReferenceLiteral:
40424042
case KnownProtocolKind::ExpressibleByBuiltinBooleanLiteral:
4043-
case KnownProtocolKind::ExpressibleByBuiltinUTF16ExtendedGraphemeClusterLiteral:
40444043
case KnownProtocolKind::ExpressibleByBuiltinExtendedGraphemeClusterLiteral:
40454044
case KnownProtocolKind::ExpressibleByBuiltinFloatLiteral:
40464045
case KnownProtocolKind::ExpressibleByBuiltinIntegerLiteral:
40474046
case KnownProtocolKind::ExpressibleByBuiltinStringLiteral:
4048-
case KnownProtocolKind::ExpressibleByBuiltinUTF16StringLiteral:
40494047
case KnownProtocolKind::ExpressibleByBuiltinUnicodeScalarLiteral:
40504048
case KnownProtocolKind::OptionSet:
40514049
case KnownProtocolKind::BridgedNSError:

lib/SILOptimizer/Utils/Local.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -732,16 +732,10 @@ bool StringConcatenationOptimizer::extractStringConcatOperands() {
732732
auto AILeftOperandsNum = AILeft->getNumOperands();
733733
auto AIRightOperandsNum = AIRight->getNumOperands();
734734

735-
// makeUTF16 should have following parameters:
736-
// (start: RawPointer, utf16CodeUnitCount: Word)
737735
// makeUTF8 should have the following parameters:
738736
// (start: RawPointer, utf8CodeUnitCount: Word, isASCII: Int1)
739-
if (!((FRILeftFun->hasSemanticsAttr("string.makeUTF16") &&
740-
AILeftOperandsNum == 4) ||
741-
(FRILeftFun->hasSemanticsAttr("string.makeUTF8") &&
737+
if (!((FRILeftFun->hasSemanticsAttr("string.makeUTF8") &&
742738
AILeftOperandsNum == 5) ||
743-
(FRIRightFun->hasSemanticsAttr("string.makeUTF16") &&
744-
AIRightOperandsNum == 4) ||
745739
(FRIRightFun->hasSemanticsAttr("string.makeUTF8") &&
746740
AIRightOperandsNum == 5)))
747741
return false;

lib/Sema/CSApply.cpp

Lines changed: 11 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -2094,55 +2094,24 @@ namespace {
20942094
Diag<> brokenBuiltinProtocolDiag;
20952095

20962096
if (isStringLiteral) {
2097-
// If the string contains only ASCII, force a UTF8 representation
2098-
bool forceASCII = stringLiteral != nullptr;
2099-
if (forceASCII) {
2100-
for (auto c: stringLiteral->getValue()) {
2101-
if (c & (1 << 7)) {
2102-
forceASCII = false;
2103-
break;
2104-
}
2105-
}
2106-
}
2107-
21082097
literalType = tc.Context.Id_StringLiteralType;
21092098

21102099
literalFuncName = DeclName(tc.Context, DeclBaseName::createConstructor(),
21112100
{ tc.Context.Id_stringLiteral });
21122101

2113-
// If the string contains non-ASCII and the type can handle
2114-
// UTF-16 string literals, prefer them.
21152102
builtinProtocol = tc.getProtocol(
21162103
expr->getLoc(),
2117-
KnownProtocolKind::ExpressibleByBuiltinUTF16StringLiteral);
2118-
2119-
if (!forceASCII && (tc.conformsToProtocol(
2120-
type, builtinProtocol, cs.DC,
2121-
ConformanceCheckFlags::InExpression))) {
2122-
builtinLiteralFuncName =
2123-
DeclName(tc.Context, DeclBaseName::createConstructor(),
2124-
{tc.Context.Id_builtinUTF16StringLiteral,
2125-
tc.Context.getIdentifier("utf16CodeUnitCount")});
2126-
2127-
if (stringLiteral)
2128-
stringLiteral->setEncoding(StringLiteralExpr::UTF16);
2129-
else
2130-
magicLiteral->setStringEncoding(StringLiteralExpr::UTF16);
2131-
} else {
2132-
// Otherwise, fall back to UTF-8.
2133-
builtinProtocol = tc.getProtocol(
2134-
expr->getLoc(),
2135-
KnownProtocolKind::ExpressibleByBuiltinStringLiteral);
2136-
builtinLiteralFuncName
2137-
= DeclName(tc.Context, DeclBaseName::createConstructor(),
2138-
{ tc.Context.Id_builtinStringLiteral,
2139-
tc.Context.getIdentifier("utf8CodeUnitCount"),
2140-
tc.Context.getIdentifier("isASCII") });
2141-
if (stringLiteral)
2142-
stringLiteral->setEncoding(StringLiteralExpr::UTF8);
2143-
else
2144-
magicLiteral->setStringEncoding(StringLiteralExpr::UTF8);
2145-
}
2104+
KnownProtocolKind::ExpressibleByBuiltinStringLiteral);
2105+
builtinLiteralFuncName
2106+
= DeclName(tc.Context, DeclBaseName::createConstructor(),
2107+
{ tc.Context.Id_builtinStringLiteral,
2108+
tc.Context.getIdentifier("utf8CodeUnitCount"),
2109+
tc.Context.getIdentifier("isASCII") });
2110+
if (stringLiteral)
2111+
stringLiteral->setEncoding(StringLiteralExpr::UTF8);
2112+
else
2113+
magicLiteral->setStringEncoding(StringLiteralExpr::UTF8);
2114+
21462115
brokenProtocolDiag = diag::string_literal_broken_proto;
21472116
brokenBuiltinProtocolDiag = diag::builtin_string_literal_broken_proto;
21482117
} else if (isGraphemeClusterLiteral) {
@@ -2163,26 +2132,6 @@ namespace {
21632132
diag::extended_grapheme_cluster_literal_broken_proto;
21642133
brokenBuiltinProtocolDiag =
21652134
diag::builtin_extended_grapheme_cluster_literal_broken_proto;
2166-
2167-
auto *builtinUTF16ExtendedGraphemeClusterProtocol = tc.getProtocol(
2168-
expr->getLoc(),
2169-
KnownProtocolKind::ExpressibleByBuiltinUTF16ExtendedGraphemeClusterLiteral);
2170-
if (tc.conformsToProtocol(type,
2171-
builtinUTF16ExtendedGraphemeClusterProtocol,
2172-
cs.DC, ConformanceCheckFlags::InExpression)) {
2173-
builtinLiteralFuncName
2174-
= DeclName(tc.Context, DeclBaseName::createConstructor(),
2175-
{ tc.Context.Id_builtinExtendedGraphemeClusterLiteral,
2176-
tc.Context.getIdentifier("utf16CodeUnitCount") });
2177-
2178-
builtinProtocol = builtinUTF16ExtendedGraphemeClusterProtocol;
2179-
brokenBuiltinProtocolDiag =
2180-
diag::builtin_utf16_extended_grapheme_cluster_literal_broken_proto;
2181-
if (stringLiteral)
2182-
stringLiteral->setEncoding(StringLiteralExpr::UTF16);
2183-
else
2184-
magicLiteral->setStringEncoding(StringLiteralExpr::UTF16);
2185-
}
21862135
} else {
21872136
// Otherwise, we should have just one Unicode scalar.
21882137
literalType = tc.Context.Id_UnicodeScalarLiteralType;

stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,108 @@
1212

1313
import StdlibUnittest
1414

15+
extension String {
16+
func parseUTF8CodeUnits() -> [UInt8] {
17+
var utf8 = [UInt8]()
18+
let units = self.split(separator: " ")
19+
let scalars = units.compactMap { string -> Unicode.Scalar? in
20+
let i = Int(string, radix: 16)!
21+
return Unicode.Scalar(i)
22+
23+
}
24+
25+
for scalar in scalars {
26+
utf8 += String(scalar).utf8
27+
}
28+
return utf8
29+
}
30+
31+
func parseUTF16CodeUnits() -> [UInt16] {
32+
var utf16 = [UInt16]()
33+
let units = self.split(separator: " ")
34+
let scalars = units.compactMap { string -> Unicode.Scalar? in
35+
let i = Int(string, radix: 16)!
36+
return Unicode.Scalar(i)
37+
}
38+
39+
for scalar in scalars {
40+
utf16 += scalar.utf16
41+
}
42+
return utf16
43+
}
44+
}
45+
46+
public struct NormalizationTest {
47+
public let loc: SourceLoc
48+
public let sourceUTF16: [UInt16]
49+
public let source: [UInt8]
50+
public let NFC: [UInt8]
51+
public let NFD: [UInt8]
52+
public let NFKC: [UInt8]
53+
public let NFKD: [UInt8]
54+
55+
init(
56+
loc: SourceLoc,
57+
source: String,
58+
NFC: String,
59+
NFD: String,
60+
NFKC: String,
61+
NFKD: String
62+
) {
63+
self.loc = loc
64+
self.sourceUTF16 = source.parseUTF16CodeUnits()
65+
self.source = source.parseUTF8CodeUnits()
66+
self.NFC = NFC.parseUTF8CodeUnits()
67+
self.NFD = NFD.parseUTF8CodeUnits()
68+
self.NFKC = NFKC.parseUTF8CodeUnits()
69+
self.NFKD = NFKD.parseUTF8CodeUnits()
70+
}
71+
}
72+
73+
// Normalization tests are currently only available on Darwin, awaiting a sensible
74+
// file API...
75+
#if _runtime(_ObjC)
76+
import Foundation
77+
public let normalizationTests: [NormalizationTest] = {
78+
var tests = [NormalizationTest]()
79+
80+
let file = CommandLine.arguments[2]
81+
let fileURL = URL(fileURLWithPath: file)
82+
83+
let fileContents = try! String(contentsOf: fileURL) + "" // go faster
84+
85+
var lineNumber: UInt = 0
86+
for line in fileContents.split(separator: "\n") {
87+
lineNumber += 1
88+
guard line.hasPrefix("#") == false else {
89+
continue
90+
}
91+
92+
let content = line.split(separator: "#").first!
93+
94+
guard !content.isEmpty else {
95+
continue
96+
}
97+
guard !content.hasPrefix("@") else {
98+
continue
99+
}
100+
101+
let columns = content.split(separator: ";").filter { $0 != " " }.map(String.init)
102+
let test = NormalizationTest(
103+
loc: SourceLoc(file, lineNumber),
104+
source: columns[0],
105+
NFC: columns[1],
106+
NFD: columns[2],
107+
NFKC: columns[3],
108+
NFKD: columns[4])
109+
110+
tests.append(test)
111+
}
112+
113+
return tests
114+
}()
115+
#endif
116+
15117
public struct UTFTest {
16118
public struct Flags : OptionSet {
17119
public let rawValue: Int

stdlib/private/StdlibUnittest/StdlibCoreExtras.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import Foundation
2828
//
2929

3030
func findSubstring(_ haystack: Substring, _ needle: String) -> String.Index? {
31-
return findSubstring(String(haystack._ephemeralContent), needle)
31+
return findSubstring(haystack._ephemeralString, needle)
3232
}
3333

3434
func findSubstring(_ string: String, _ substring: String) -> String.Index? {

0 commit comments

Comments
 (0)