@@ -519,6 +519,50 @@ TEST_F(LexerTest, RestoreStopAtCodeCompletion) {
519
519
ASSERT_EQ (tok::eof, Tok.getKind ());
520
520
}
521
521
522
+ TEST_F (LexerTest, CharactersContainTheEdgeContinuationByte) {
523
+ // A continuation byte must be in the range greater than or
524
+ // equal to 0x80 and less than or equal to 0xBF
525
+
526
+ // À(0xC3 0x80), 㗀(0xE3 0x97 0x80), 🀀(0xF0 0x9F 0x80 0x80),
527
+ // ÿ(0xC3 0xBF), 俿(0xE4 0xBF 0xBF), 𐐿(0xF0 0x90 0x90 0xBF)
528
+ const char *Source = " À 㗀 🀀 ÿ 俿 𐐿" ;
529
+
530
+ LangOptions LangOpts;
531
+ SourceManager SourceMgr;
532
+ unsigned BufferID = SourceMgr.addMemBufferCopy (Source);
533
+
534
+ Lexer L (LangOpts, SourceMgr, BufferID, /* Diags=*/ nullptr , LexerMode::Swift);
535
+
536
+ Token Tok;
537
+
538
+ L.lex (Tok);
539
+ ASSERT_EQ (tok::identifier, Tok.getKind ());
540
+ ASSERT_EQ (" À" , Tok.getText ());
541
+
542
+ L.lex (Tok);
543
+ ASSERT_EQ (tok::identifier, Tok.getKind ());
544
+ ASSERT_EQ (" 㗀" , Tok.getText ());
545
+
546
+ L.lex (Tok);
547
+ ASSERT_EQ (tok::identifier, Tok.getKind ());
548
+ ASSERT_EQ (" 🀀" , Tok.getText ());
549
+
550
+ L.lex (Tok);
551
+ ASSERT_EQ (tok::identifier, Tok.getKind ());
552
+ ASSERT_EQ (" ÿ" , Tok.getText ());
553
+
554
+ L.lex (Tok);
555
+ ASSERT_EQ (tok::identifier, Tok.getKind ());
556
+ ASSERT_EQ (" 俿" , Tok.getText ());
557
+
558
+ L.lex (Tok);
559
+ ASSERT_EQ (tok::identifier, Tok.getKind ());
560
+ ASSERT_EQ (" 𐐿" , Tok.getText ());
561
+
562
+ L.lex (Tok);
563
+ ASSERT_EQ (tok::eof, Tok.getKind ());
564
+ }
565
+
522
566
TEST_F (LexerTest, getLocForStartOfToken) {
523
567
const char *Source = " aaa \n \t bbb \" hello\" \" -\\ (val)-\" " ;
524
568
@@ -710,6 +754,29 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
710
754
DiagConsumer.messages , " 1, 4: nul character embedded in middle of file" ));
711
755
}
712
756
757
+ TEST_F (LexerTest, InvalidUTF8Bytes) {
758
+ const char *Source = " \x80 " ;
759
+
760
+ LangOptions LangOpts;
761
+ SourceManager SourceMgr;
762
+ unsigned BufferID = SourceMgr.addMemBufferCopy (Source);
763
+
764
+ StringCaptureDiagnosticConsumer DiagConsumer;
765
+ DiagnosticEngine Diags (SourceMgr);
766
+ Diags.addConsumer (DiagConsumer);
767
+
768
+ Lexer L (LangOpts, SourceMgr, BufferID, &Diags, LexerMode::Swift);
769
+
770
+ Token Tok;
771
+
772
+ L.lex (Tok);
773
+
774
+ ASSERT_EQ (DiagConsumer.messages .size (), 1 );
775
+ auto message = DiagConsumer.messages .front ();
776
+ ASSERT_TRUE (message.find (" invalid UTF-8 found in source file" ) !=
777
+ std::string::npos);
778
+ }
779
+
713
780
#if HAS_MMAP
714
781
715
782
// This test requires mmap because llvm::sys::Memory doesn't support protecting
0 commit comments