Skip to content

Commit 4699a7e

Browse files
committed
[lldb/StringPrinter] Support strings with invalid utf8 sub-sequences
Support printing strings which contain invalid utf8 sub-sequences, e.g. strings like "hello world \xfe", instead of bailing out with "Summary Unavailable". I took the opportunity here to delete some hand-rolled utf8 -> utf32 conversion code and replace it with calls into llvm's Support library. rdar://61554346
1 parent 7822b8a commit 4699a7e

File tree

4 files changed

+37
-69
lines changed

4 files changed

+37
-69
lines changed

lldb/source/DataFormatters/StringPrinter.cpp

Lines changed: 22 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "lldb/Target/Target.h"
1616
#include "lldb/Utility/Status.h"
1717

18+
#include "llvm/ADT/StringExtras.h"
1819
#include "llvm/Support/ConvertUTF.h"
1920

2021
#include <ctype.h>
@@ -92,7 +93,7 @@ static bool isprint32(char32_t codepoint) {
9293
return true;
9394
}
9495

95-
DecodedCharBuffer attemptASCIIEscape(char32_t c,
96+
DecodedCharBuffer attemptASCIIEscape(llvm::UTF32 c,
9697
StringPrinter::EscapeStyle escape_style) {
9798
const bool is_swift_escape_style =
9899
escape_style == StringPrinter::EscapeStyle::Swift;
@@ -141,7 +142,10 @@ DecodedCharBuffer GetPrintableImpl<StringElementType::ASCII>(
141142
DecodedCharBuffer retval = attemptASCIIEscape(*buffer, escape_style);
142143
if (retval.GetSize())
143144
return retval;
144-
if (isprint(*buffer))
145+
146+
// Use llvm's locale-independent isPrint(char), instead of the libc
147+
// implementation which may give different results on different platforms.
148+
if (llvm::isPrint(*buffer))
145149
return {buffer, 1};
146150

147151
unsigned escaped_len;
@@ -161,60 +165,30 @@ DecodedCharBuffer GetPrintableImpl<StringElementType::ASCII>(
161165
return {data, escaped_len};
162166
}
163167

164-
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1) {
165-
return (c0 - 192) * 64 + (c1 - 128);
166-
}
167-
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1,
168-
unsigned char c2) {
169-
return (c0 - 224) * 4096 + (c1 - 128) * 64 + (c2 - 128);
170-
}
171-
static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1,
172-
unsigned char c2, unsigned char c3) {
173-
return (c0 - 240) * 262144 + (c2 - 128) * 4096 + (c2 - 128) * 64 + (c3 - 128);
174-
}
175-
176168
template <>
177169
DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>(
178170
uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next,
179171
StringPrinter::EscapeStyle escape_style) {
180-
const unsigned utf8_encoded_len = llvm::getNumBytesForUTF8(*buffer);
181-
182-
// If the utf8 encoded length is invalid, or if there aren't enough bytes to
183-
// print, this is some kind of corrupted string.
184-
if (utf8_encoded_len == 0 || utf8_encoded_len > 4)
185-
return nullptr;
186-
if ((buffer_end - buffer) < utf8_encoded_len)
187-
// There's no room in the buffer for the utf8 sequence.
188-
return nullptr;
189-
190-
char32_t codepoint = 0;
191-
switch (utf8_encoded_len) {
192-
case 1:
193-
// this is just an ASCII byte - ask ASCII
172+
// If the utf8 encoded length is invalid (i.e., not in the closed interval
173+
// [1;4]), or if there aren't enough bytes to print, or if the subsequence
174+
// isn't valid utf8, fall back to printing an ASCII-escaped subsequence.
175+
if (!llvm::isLegalUTF8Sequence(buffer, buffer_end))
194176
return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next,
195177
escape_style);
196-
case 2:
197-
codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer,
198-
(unsigned char)*(buffer + 1));
199-
break;
200-
case 3:
201-
codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer,
202-
(unsigned char)*(buffer + 1),
203-
(unsigned char)*(buffer + 2));
204-
break;
205-
case 4:
206-
codepoint = ConvertUTF8ToCodePoint(
207-
(unsigned char)*buffer, (unsigned char)*(buffer + 1),
208-
(unsigned char)*(buffer + 2), (unsigned char)*(buffer + 3));
209-
break;
210-
}
211178

212-
// We couldn't figure out how to print this codepoint.
213-
if (!codepoint)
214-
return nullptr;
179+
// Convert the valid utf8 sequence to a utf32 codepoint. This cannot fail.
180+
llvm::UTF32 codepoint = 0;
181+
const llvm::UTF8 *buffer_for_conversion = buffer;
182+
llvm::ConversionResult result = llvm::convertUTF8Sequence(
183+
&buffer_for_conversion, buffer_end, &codepoint, llvm::strictConversion);
184+
assert(result == llvm::conversionOK &&
185+
"Failed to convert legal utf8 sequence");
186+
(void)result;
215187

216188
// The UTF8 helper always advances by the utf8 encoded length.
189+
const unsigned utf8_encoded_len = buffer_for_conversion - buffer;
217190
next = buffer + utf8_encoded_len;
191+
218192
DecodedCharBuffer retval = attemptASCIIEscape(codepoint, escape_style);
219193
if (retval.GetSize())
220194
return retval;
@@ -227,11 +201,11 @@ DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>(
227201
switch (escape_style) {
228202
case StringPrinter::EscapeStyle::CXX:
229203
// Prints 10 characters, then a \0 terminator.
230-
escaped_len = sprintf((char *)data, "\\U%08x", (unsigned)codepoint);
204+
escaped_len = sprintf((char *)data, "\\U%08x", codepoint);
231205
break;
232206
case StringPrinter::EscapeStyle::Swift:
233207
// Prints up to 12 characters, then a \0 terminator.
234-
escaped_len = sprintf((char *)data, "\\u{%x}", (unsigned)codepoint);
208+
escaped_len = sprintf((char *)data, "\\u{%x}", codepoint);
235209
break;
236210
}
237211
lldbassert(escaped_len > 0 && "unknown string escape style");

lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def cleanup():
120120
is_alternate_layout = ('arm' in self.getArchitecture()) and self.platformIsDarwin()
121121
if is_64_bit and not is_alternate_layout:
122122
self.expect("frame variable garbage1", substrs=['garbage1 = Summary Unavailable'])
123-
self.expect("frame variable garbage2", substrs=['garbage2 = Summary Unavailable'])
124-
self.expect("frame variable garbage3", substrs=['garbage3 = Summary Unavailable'])
123+
self.expect("frame variable garbage2", substrs=[r'garbage2 = "\xfa\xfa\xfa\xfa"'])
124+
self.expect("frame variable garbage3", substrs=[r'garbage3 = "\xf0\xf0"'])
125125
self.expect("frame variable garbage4", substrs=['garbage4 = Summary Unavailable'])
126126
self.expect("frame variable garbage5", substrs=['garbage5 = Summary Unavailable'])

lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/main.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ static struct {
1818
// A corrupt libcxx string in long mode with a payload that contains a utf8
1919
// sequence that's inherently too long.
2020
static unsigned char garbage_utf8_payload1[] = {
21-
250 // This means that we expect a 5-byte sequence, this is invalid.
21+
250, // This means that we expect a 5-byte sequence, this is invalid. LLDB
22+
// should fall back to ASCII printing.
23+
250, 250, 250
2224
};
2325
static struct {
2426
uint64_t cap = 5;
@@ -29,13 +31,14 @@ static struct {
2931
// A corrupt libcxx string in long mode with a payload that contains a utf8
3032
// sequence that's too long to fit in the buffer.
3133
static unsigned char garbage_utf8_payload2[] = {
32-
240 // This means that we expect a 4-byte sequence, but the buffer is too
33-
// small for this.
34+
240, // This means that we expect a 4-byte sequence, but the buffer is too
35+
// small for this. LLDB should fall back to ASCII printing.
36+
240
3437
};
3538
static struct {
3639
uint64_t cap = 3;
3740
uint64_t size = 2;
38-
unsigned char *data = &garbage_utf8_payload1[0];
41+
unsigned char *data = &garbage_utf8_payload2[0];
3942
} garbage_string_long_mode2;
4043

4144
// A corrupt libcxx string which has an invalid size (i.e. a size greater than

lldb/unittests/DataFormatter/StringPrinterTests.cpp

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,8 @@ TEST(StringPrinterTests, CxxASCII) {
7777
EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C"));
7878
EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348"));
7979

80-
// FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds
81-
// that these are not valid utf8 sequences, but that's OK, the raw values
82-
// should still be printed out.
83-
EXPECT_NE(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal.
84-
EXPECT_NE(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal.
80+
EXPECT_EQ(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal.
81+
EXPECT_EQ(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal.
8582
}
8683

8784
// Test UTF8 formatting for C++.
@@ -114,11 +111,8 @@ TEST(StringPrinterTests, CxxUTF8) {
114111
EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C"));
115112
EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348"));
116113

117-
// FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds
118-
// that these are not valid utf8 sequences, but that's OK, the raw values
119-
// should still be printed out.
120-
EXPECT_NE(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal.
121-
EXPECT_NE(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal.
114+
EXPECT_EQ(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal.
115+
EXPECT_EQ(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal.
122116
}
123117

124118
// Test UTF8 formatting for Swift.
@@ -151,9 +145,6 @@ TEST(StringPrinterTests, SwiftUTF8) {
151145
EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C"));
152146
EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348"));
153147

154-
// FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds
155-
// that these are not valid utf8 sequences, but that's OK, the raw values
156-
// should still be printed out.
157-
EXPECT_NE(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal.
158-
EXPECT_NE(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal.
148+
EXPECT_EQ(fmt("\376"), QUOTE(R"(\u{fe})")); // \376 is 254 in decimal.
149+
EXPECT_EQ(fmt("\xfe"), QUOTE(R"(\u{fe})")); // \xfe is 254 in decimal.
159150
}

0 commit comments

Comments
 (0)