Skip to content

Commit 50d6b1f

Browse files
committed
[Parse] Add test cases for validating UTF-8 correctness
1 parent e6d8d39 commit 50d6b1f

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

unittests/Parse/LexerTests.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,50 @@ TEST_F(LexerTest, RestoreStopAtCodeCompletion) {
519519
ASSERT_EQ(tok::eof, Tok.getKind());
520520
}
521521

522+
/// Checks that multi-byte UTF-8 characters whose continuation bytes sit at the
/// edges of the valid continuation range are lexed as identifiers.
///
/// A UTF-8 continuation byte must lie in the inclusive range [0x80, 0xBF].
/// Every character below ends in one of those two boundary bytes, for each of
/// the two-, three- and four-byte encodings.
TEST_F(LexerTest, CharactersContainTheEdgeContinuationByte) {
  // Lower edge (last byte 0x80):
  //   À (0xC3 0x80), 㗀 (0xE3 0x97 0x80), 🀀 (0xF0 0x9F 0x80 0x80)
  // Upper edge (last byte 0xBF):
  //   ÿ (0xC3 0xBF), 俿 (0xE4 0xBF 0xBF), 𐐿 (0xF0 0x90 0x90 0xBF)
  const char *Source = "À 㗀 🀀 ÿ 俿 𐐿";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(Source);

  // No diagnostic engine is attached; the test only inspects the tokens.
  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, LexerMode::Swift);

  Token Tok;

  // Two-byte sequence ending in 0x80.
  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("À", Tok.getText());

  // Three-byte sequence ending in 0x80.
  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("㗀", Tok.getText());

  // Four-byte sequence ending in 0x80.
  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("🀀", Tok.getText());

  // Two-byte sequence ending in 0xBF.
  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("ÿ", Tok.getText());

  // Three-byte sequence ending in 0xBF.
  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("俿", Tok.getText());

  // Four-byte sequence ending in 0xBF.
  L.lex(Tok);
  ASSERT_EQ(tok::identifier, Tok.getKind());
  ASSERT_EQ("𐐿", Tok.getText());

  // Nothing should remain after the six identifiers.
  L.lex(Tok);
  ASSERT_EQ(tok::eof, Tok.getKind());
}
565+
522566
TEST_F(LexerTest, getLocForStartOfToken) {
523567
const char *Source = "aaa \n \tbbb \"hello\" \"-\\(val)-\"";
524568

@@ -710,6 +754,29 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
710754
DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
711755
}
712756

757+
/// Checks that lexing a buffer containing only the byte 0x80 reports exactly
/// one diagnostic, and that its text mentions invalid UTF-8.
TEST_F(LexerTest, InvalidUTF8Bytes) {
  // 0x80 is a UTF-8 continuation byte; with no lead byte before it, it cannot
  // begin a well-formed sequence.
  const char *Source = "\x80";

  LangOptions LangOpts;
  SourceManager SourceMgr;
  unsigned BufferID = SourceMgr.addMemBufferCopy(Source);

  // Capture emitted diagnostics as strings so they can be inspected below.
  StringCaptureDiagnosticConsumer DiagConsumer;
  DiagnosticEngine Diags(SourceMgr);
  Diags.addConsumer(DiagConsumer);

  Lexer L(LangOpts, SourceMgr, BufferID, &Diags, LexerMode::Swift);

  Token Tok;

  L.lex(Tok);

  // Expected value first, with an unsigned literal so the comparison against
  // the container's size does not mix signed and unsigned operands.
  ASSERT_EQ(1u, DiagConsumer.messages.size());
  // Bind by reference; the message is only read.
  const auto &Message = DiagConsumer.messages.front();
  ASSERT_TRUE(Message.find("invalid UTF-8 found in source file") !=
              std::string::npos);
}
779+
713780
#if HAS_MMAP
714781

715782
// This test requires mmap because llvm::sys::Memory doesn't support protecting

0 commit comments

Comments
 (0)