Skip to content

Commit 5edd379

Browse files
authored
Merge pull request #70763 from pinkjuice66/parse-refine-utf8validation
[Parse] Refine UTF8 validation-related aspects
2 parents 5d2454f + 50d6b1f commit 5edd379

File tree

2 files changed

+77
-15
lines changed

2 files changed

+77
-15
lines changed

lib/Parse/Lexer.cpp

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -91,18 +91,12 @@ static bool EncodeToUTF8(unsigned CharValue,
9191
return false;
9292
}
9393

94-
95-
/// CLO8 - Return the number of leading ones in the specified 8-bit value.
96-
static unsigned CLO8(unsigned char C) {
97-
return llvm::countl_one(uint32_t(C) << 24);
98-
}
99-
10094
/// isStartOfUTF8Character - Return true if this isn't a UTF8 continuation
10195
/// character, which will be of the form 0b10XXXXXX
10296
static bool isStartOfUTF8Character(unsigned char C) {
10397
// RFC 2279: The octet values FE and FF never appear.
10498
// RFC 3629: The octet values C0, C1, F5 to FF never appear.
105-
return C <= 0x80 || (C >= 0xC2 && C < 0xF5);
99+
return C < 0x80 || (C >= 0xC2 && C < 0xF5);
106100
}
107101

108102
/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
@@ -117,25 +111,26 @@ uint32_t swift::validateUTF8CharacterAndAdvance(const char *&Ptr,
117111
if (CurByte < 0x80)
118112
return CurByte;
119113

120-
// Read the number of high bits set, which indicates the number of bytes in
121-
// the character.
122-
unsigned EncodedBytes = CLO8(CurByte);
123-
124-
// If this is 0b10XXXXXX, then it is a continuation character.
125-
if (EncodedBytes == 1 ||
126-
!isStartOfUTF8Character(CurByte)) {
114+
// If this is not the start of a UTF8 character,
115+
// then it is either a continuation byte or an invalid UTF8 code point.
116+
if (!isStartOfUTF8Character(CurByte)) {
127117
// Skip until we get the start of another character. This is guaranteed to
128118
// at least stop at the nul at the end of the buffer.
129119
while (Ptr < End && !isStartOfUTF8Character(*Ptr))
130120
++Ptr;
131121
return ~0U;
132122
}
133123

124+
// Read the number of high bits set, which indicates the number of bytes in
125+
// the character.
126+
unsigned char EncodedBytes = llvm::countl_one(CurByte);
127+
assert((EncodedBytes >= 2 && EncodedBytes <= 4));
128+
134129
// Drop the high bits indicating the # bytes of the result.
135130
unsigned CharValue = (unsigned char)(CurByte << EncodedBytes) >> EncodedBytes;
136131

137132
// Read and validate the continuation bytes.
138-
for (unsigned i = 1; i != EncodedBytes; ++i) {
133+
for (unsigned char i = 1; i != EncodedBytes; ++i) {
139134
if (Ptr >= End)
140135
return ~0U;
141136
CurByte = *Ptr;

unittests/Parse/LexerTests.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,50 @@ TEST_F(LexerTest, RestoreStopAtCodeCompletion) {
519519
ASSERT_EQ(tok::eof, Tok.getKind());
520520
}
521521

522+
TEST_F(LexerTest, CharactersContainTheEdgeContinuationByte) {
523+
// A continuation byte must be in the range greater than or
524+
// equal to 0x80 and less than or equal to 0xBF
525+
526+
// À(0xC3 0x80), 㗀(0xE3 0x97 0x80), 🀀(0xF0 0x9F 0x80 0x80),
527+
// ÿ(0xC3 0xBF), 俿(0xE4 0xBF 0xBF), 𐐿(0xF0 0x90 0x90 0xBF)
528+
const char *Source = "À 㗀 🀀 ÿ 俿 𐐿";
529+
530+
LangOptions LangOpts;
531+
SourceManager SourceMgr;
532+
unsigned BufferID = SourceMgr.addMemBufferCopy(Source);
533+
534+
Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, LexerMode::Swift);
535+
536+
Token Tok;
537+
538+
L.lex(Tok);
539+
ASSERT_EQ(tok::identifier, Tok.getKind());
540+
ASSERT_EQ("À", Tok.getText());
541+
542+
L.lex(Tok);
543+
ASSERT_EQ(tok::identifier, Tok.getKind());
544+
ASSERT_EQ("㗀", Tok.getText());
545+
546+
L.lex(Tok);
547+
ASSERT_EQ(tok::identifier, Tok.getKind());
548+
ASSERT_EQ("🀀", Tok.getText());
549+
550+
L.lex(Tok);
551+
ASSERT_EQ(tok::identifier, Tok.getKind());
552+
ASSERT_EQ("ÿ", Tok.getText());
553+
554+
L.lex(Tok);
555+
ASSERT_EQ(tok::identifier, Tok.getKind());
556+
ASSERT_EQ("俿", Tok.getText());
557+
558+
L.lex(Tok);
559+
ASSERT_EQ(tok::identifier, Tok.getKind());
560+
ASSERT_EQ("𐐿", Tok.getText());
561+
562+
L.lex(Tok);
563+
ASSERT_EQ(tok::eof, Tok.getKind());
564+
}
565+
522566
TEST_F(LexerTest, getLocForStartOfToken) {
523567
const char *Source = "aaa \n \tbbb \"hello\" \"-\\(val)-\"";
524568

@@ -710,6 +754,29 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
710754
DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
711755
}
712756

757+
TEST_F(LexerTest, InvalidUTF8Bytes) {
758+
const char *Source = "\x80";
759+
760+
LangOptions LangOpts;
761+
SourceManager SourceMgr;
762+
unsigned BufferID = SourceMgr.addMemBufferCopy(Source);
763+
764+
StringCaptureDiagnosticConsumer DiagConsumer;
765+
DiagnosticEngine Diags(SourceMgr);
766+
Diags.addConsumer(DiagConsumer);
767+
768+
Lexer L(LangOpts, SourceMgr, BufferID, &Diags, LexerMode::Swift);
769+
770+
Token Tok;
771+
772+
L.lex(Tok);
773+
774+
ASSERT_EQ(DiagConsumer.messages.size(), 1);
775+
auto message = DiagConsumer.messages.front();
776+
ASSERT_TRUE(message.find("invalid UTF-8 found in source file") !=
777+
std::string::npos);
778+
}
779+
713780
#if HAS_MMAP
714781

715782
// This test requires mmap because llvm::sys::Memory doesn't support protecting

0 commit comments

Comments
 (0)