15
15
#include " lldb/Target/Target.h"
16
16
#include " lldb/Utility/Status.h"
17
17
18
+ #include " llvm/ADT/StringExtras.h"
18
19
#include " llvm/Support/ConvertUTF.h"
19
20
20
21
#include < ctype.h>
@@ -92,7 +93,7 @@ static bool isprint32(char32_t codepoint) {
92
93
return true ;
93
94
}
94
95
95
- DecodedCharBuffer attemptASCIIEscape (char32_t c,
96
+ DecodedCharBuffer attemptASCIIEscape (llvm::UTF32 c,
96
97
StringPrinter::EscapeStyle escape_style) {
97
98
const bool is_swift_escape_style =
98
99
escape_style == StringPrinter::EscapeStyle::Swift;
@@ -141,7 +142,10 @@ DecodedCharBuffer GetPrintableImpl<StringElementType::ASCII>(
141
142
DecodedCharBuffer retval = attemptASCIIEscape (*buffer, escape_style);
142
143
if (retval.GetSize ())
143
144
return retval;
144
- if (isprint (*buffer))
145
+
146
+ // Use llvm's locale-independent isPrint(char), instead of the libc
147
+ // implementation which may give different results on different platforms.
148
+ if (llvm::isPrint (*buffer))
145
149
return {buffer, 1 };
146
150
147
151
unsigned escaped_len;
@@ -161,60 +165,30 @@ DecodedCharBuffer GetPrintableImpl<StringElementType::ASCII>(
161
165
return {data, escaped_len};
162
166
}
163
167
164
- static char32_t ConvertUTF8ToCodePoint (unsigned char c0, unsigned char c1) {
165
- return (c0 - 192 ) * 64 + (c1 - 128 );
166
- }
167
- static char32_t ConvertUTF8ToCodePoint (unsigned char c0, unsigned char c1,
168
- unsigned char c2) {
169
- return (c0 - 224 ) * 4096 + (c1 - 128 ) * 64 + (c2 - 128 );
170
- }
171
- static char32_t ConvertUTF8ToCodePoint (unsigned char c0, unsigned char c1,
172
- unsigned char c2, unsigned char c3) {
173
- return (c0 - 240 ) * 262144 + (c2 - 128 ) * 4096 + (c2 - 128 ) * 64 + (c3 - 128 );
174
- }
175
-
176
168
template <>
177
169
DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>(
178
170
uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next,
179
171
StringPrinter::EscapeStyle escape_style) {
180
- const unsigned utf8_encoded_len = llvm::getNumBytesForUTF8 (*buffer);
181
-
182
- // If the utf8 encoded length is invalid, or if there aren't enough bytes to
183
- // print, this is some kind of corrupted string.
184
- if (utf8_encoded_len == 0 || utf8_encoded_len > 4 )
185
- return nullptr ;
186
- if ((buffer_end - buffer) < utf8_encoded_len)
187
- // There's no room in the buffer for the utf8 sequence.
188
- return nullptr ;
189
-
190
- char32_t codepoint = 0 ;
191
- switch (utf8_encoded_len) {
192
- case 1 :
193
- // this is just an ASCII byte - ask ASCII
172
+ // If the utf8 encoded length is invalid (i.e., not in the closed interval
173
+ // [1;4]), or if there aren't enough bytes to print, or if the subsequence
174
+ // isn't valid utf8, fall back to printing an ASCII-escaped subsequence.
175
+ if (!llvm::isLegalUTF8Sequence (buffer, buffer_end))
194
176
return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next,
195
177
escape_style);
196
- case 2 :
197
- codepoint = ConvertUTF8ToCodePoint ((unsigned char )*buffer,
198
- (unsigned char )*(buffer + 1 ));
199
- break ;
200
- case 3 :
201
- codepoint = ConvertUTF8ToCodePoint ((unsigned char )*buffer,
202
- (unsigned char )*(buffer + 1 ),
203
- (unsigned char )*(buffer + 2 ));
204
- break ;
205
- case 4 :
206
- codepoint = ConvertUTF8ToCodePoint (
207
- (unsigned char )*buffer, (unsigned char )*(buffer + 1 ),
208
- (unsigned char )*(buffer + 2 ), (unsigned char )*(buffer + 3 ));
209
- break ;
210
- }
211
178
212
- // We couldn't figure out how to print this codepoint.
213
- if (!codepoint)
214
- return nullptr ;
179
+ // Convert the valid utf8 sequence to a utf32 codepoint. This cannot fail.
180
+ llvm::UTF32 codepoint = 0 ;
181
+ const llvm::UTF8 *buffer_for_conversion = buffer;
182
+ llvm::ConversionResult result = llvm::convertUTF8Sequence (
183
+ &buffer_for_conversion, buffer_end, &codepoint, llvm::strictConversion);
184
+ assert (result == llvm::conversionOK &&
185
+ " Failed to convert legal utf8 sequence" );
186
+ (void )result;
215
187
216
188
// The UTF8 helper always advances by the utf8 encoded length.
189
+ const unsigned utf8_encoded_len = buffer_for_conversion - buffer;
217
190
next = buffer + utf8_encoded_len;
191
+
218
192
DecodedCharBuffer retval = attemptASCIIEscape (codepoint, escape_style);
219
193
if (retval.GetSize ())
220
194
return retval;
@@ -227,11 +201,11 @@ DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>(
227
201
switch (escape_style) {
228
202
case StringPrinter::EscapeStyle::CXX:
229
203
// Prints 10 characters, then a \0 terminator.
230
- escaped_len = sprintf ((char *)data, " \\ U%08x" , ( unsigned ) codepoint);
204
+ escaped_len = sprintf ((char *)data, " \\ U%08x" , codepoint);
231
205
break ;
232
206
case StringPrinter::EscapeStyle::Swift:
233
207
// Prints up to 12 characters, then a \0 terminator.
234
- escaped_len = sprintf ((char *)data, " \\ u{%x}" , ( unsigned ) codepoint);
208
+ escaped_len = sprintf ((char *)data, " \\ u{%x}" , codepoint);
235
209
break ;
236
210
}
237
211
lldbassert (escaped_len > 0 && " unknown string escape style" );
0 commit comments