Skip to content

Commit ba38bcb

Browse files
author
Mingsheng Hong
committed
Introduce a new 'bytes' form of the string_literal SIL instruction.
It prints and parses as a stable hexadecimal form that is not interpreted as UTF-8. One use case is representing serialized protobuf strings (as in the tensorflow branch: https://github.com/apple/swift/blob/f7ed452ebaabb21c866b110a885f0d8afd280ee1/lib/SILOptimizer/Mandatory/TFPartition.cpp#L3875). The original work was done by @lattner and merged into the tensorflow branch; this PR upstreams those changes.
1 parent 4483276 commit ba38bcb

File tree

10 files changed

+47
-3
lines changed

10 files changed

+47
-3
lines changed

include/swift/SIL/SILInstruction.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2894,6 +2894,7 @@ class StringLiteralInst final
28942894

28952895
public:
28962896
enum class Encoding {
2897+
Bytes,
28972898
UTF8,
28982899
UTF16,
28992900
/// UTF-8 encoding of an Objective-C selector.

lib/IRGen/GenConstant.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ llvm::Constant *irgen::emitConstantFP(IRGenModule &IGM, FloatLiteralInst *FLI) {
5454
llvm::Constant *irgen::emitAddrOfConstantString(IRGenModule &IGM,
5555
StringLiteralInst *SLI) {
5656
switch (SLI->getEncoding()) {
57+
case StringLiteralInst::Encoding::Bytes:
5758
case StringLiteralInst::Encoding::UTF8:
5859
return IGM.getAddrOfGlobalString(SLI->getValue());
5960

lib/ParseSIL/ParseSIL.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2364,6 +2364,8 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
23642364
encoding = StringLiteralInst::Encoding::UTF16;
23652365
} else if (P.Tok.getText() == "objc_selector") {
23662366
encoding = StringLiteralInst::Encoding::ObjCSelector;
2367+
} else if (P.Tok.getText() == "bytes") {
2368+
encoding = StringLiteralInst::Encoding::Bytes;
23672369
} else {
23682370
P.diagnose(P.Tok, diag::sil_string_invalid_encoding, P.Tok.getText());
23692371
return true;
@@ -2380,6 +2382,30 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
23802382

23812383
// Ask the lexer to interpret the entire string as a literal segment.
23822384
SmallVector<char, 128> stringBuffer;
2385+
2386+
if (encoding == StringLiteralInst::Encoding::Bytes) {
2387+
// Decode hex bytes.
2388+
if (rawString.size() & 1) {
2389+
P.diagnose(P.Tok, diag::expected_tok_in_sil_instr,
2390+
"even number of hex bytes");
2391+
return true;
2392+
}
2393+
while (!rawString.empty()) {
2394+
unsigned byte1 = llvm::hexDigitValue(rawString[0]);
2395+
unsigned byte2 = llvm::hexDigitValue(rawString[1]);
2396+
if (byte1 == -1U || byte2 == -1U) {
2397+
P.diagnose(P.Tok, diag::expected_tok_in_sil_instr,
2398+
"hex bytes should contain 0-9, a-f, A-F only");
2399+
return true;
2400+
}
2401+
stringBuffer.push_back((unsigned char)(byte1 << 4) | byte2);
2402+
rawString = rawString.drop_front(2);
2403+
}
2404+
2405+
ResultVal = B.createStringLiteral(InstLoc, stringBuffer, encoding);
2406+
break;
2407+
}
2408+
23832409
StringRef string = P.L->getEncodedStringSegment(rawString, stringBuffer);
23842410
P.consumeToken(tok::string_literal);
23852411
if (parseSILDebugLocation(InstLoc, B))

lib/SIL/SILGlobalVariable.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ bool SILGlobalVariable::isValidStaticInitializerInst(const SILInstruction *I,
106106
}
107107
case SILInstructionKind::StringLiteralInst:
108108
switch (cast<StringLiteralInst>(I)->getEncoding()) {
109+
case StringLiteralInst::Encoding::Bytes:
109110
case StringLiteralInst::Encoding::UTF8:
110111
case StringLiteralInst::Encoding::UTF16:
111112
return true;

lib/SIL/SILPrinter.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1210,6 +1210,7 @@ class SILPrinter : public SILInstructionVisitor<SILPrinter> {
12101210
}
12111211
static StringRef getStringEncodingName(StringLiteralInst::Encoding kind) {
12121212
switch (kind) {
1213+
case StringLiteralInst::Encoding::Bytes: return "bytes ";
12131214
case StringLiteralInst::Encoding::UTF8: return "utf8 ";
12141215
case StringLiteralInst::Encoding::UTF16: return "utf16 ";
12151216
case StringLiteralInst::Encoding::ObjCSelector: return "objc_selector ";
@@ -1218,8 +1219,17 @@ class SILPrinter : public SILInstructionVisitor<SILPrinter> {
12181219
}
12191220

12201221
void visitStringLiteralInst(StringLiteralInst *SLI) {
1221-
*this << getStringEncodingName(SLI->getEncoding())
1222-
<< QuotedString(SLI->getValue());
1222+
*this << getStringEncodingName(SLI->getEncoding());
1223+
1224+
if (SLI->getEncoding() != StringLiteralInst::Encoding::Bytes) {
1225+
// FIXME: this isn't correct: this doesn't properly handle translating
1226+
// UTF16 into UTF8, and the SIL parser always parses as UTF8.
1227+
*this << QuotedString(SLI->getValue());
1228+
return;
1229+
}
1230+
1231+
// "Bytes" are always output in a hexadecimal form.
1232+
*this << '"' << llvm::toHex(SLI->getValue()) << '"';
12231233
}
12241234

12251235
static StringRef

lib/SILGen/SILGenApply.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1522,8 +1522,9 @@ static RValue emitStringLiteral(SILGenFunction &SGF, Expr *E, StringRef Str,
15221522
TypeElts = TypeEltsArray;
15231523
break;
15241524

1525+
case StringLiteralInst::Encoding::Bytes:
15251526
case StringLiteralInst::Encoding::ObjCSelector:
1526-
llvm_unreachable("Objective-C selectors cannot be formed here");
1527+
llvm_unreachable("these cannot be formed here");
15271528
}
15281529

15291530
CanType ty =

lib/SILOptimizer/Utils/SpecializationMangler.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ FunctionSignatureSpecializationMangler::mangleConstantProp(LiteralInst *LI) {
231231

232232
ArgOpBuffer << 's';
233233
switch (SLI->getEncoding()) {
234+
case StringLiteralInst::Encoding::Bytes: ArgOpBuffer << 'B'; break;
234235
case StringLiteralInst::Encoding::UTF8: ArgOpBuffer << 'b'; break;
235236
case StringLiteralInst::Encoding::UTF16: ArgOpBuffer << 'w'; break;
236237
case StringLiteralInst::Encoding::ObjCSelector: ArgOpBuffer << 'c'; break;

lib/Serialization/DeserializeSIL.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ STATISTIC(NumDeserializedFunc, "Number of deserialized SIL functions");
4949
static Optional<StringLiteralInst::Encoding>
5050
fromStableStringEncoding(unsigned value) {
5151
switch (value) {
52+
case SIL_BYTES: return StringLiteralInst::Encoding::Bytes;
5253
case SIL_UTF8: return StringLiteralInst::Encoding::UTF8;
5354
case SIL_UTF16: return StringLiteralInst::Encoding::UTF16;
5455
case SIL_OBJC_SELECTOR: return StringLiteralInst::Encoding::ObjCSelector;

lib/Serialization/SILFormat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ enum SILStringEncoding : uint8_t {
3333
SIL_UTF8,
3434
SIL_UTF16,
3535
SIL_OBJC_SELECTOR,
36+
SIL_BYTES
3637
};
3738

3839
enum SILLinkageEncoding : uint8_t {

lib/Serialization/SerializeSIL.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ using llvm::BCBlockRAII;
4444

4545
static unsigned toStableStringEncoding(StringLiteralInst::Encoding encoding) {
4646
switch (encoding) {
47+
case StringLiteralInst::Encoding::Bytes: return SIL_BYTES;
4748
case StringLiteralInst::Encoding::UTF8: return SIL_UTF8;
4849
case StringLiteralInst::Encoding::UTF16: return SIL_UTF16;
4950
case StringLiteralInst::Encoding::ObjCSelector: return SIL_OBJC_SELECTOR;

0 commit comments

Comments
 (0)