Skip to content

Commit ebd5323

Browse files
committed
[Syntax] add UTF-8 BOM support to libSyntax
1 parent c8a6ef1 commit ebd5323

File tree

3 files changed

+21
-6
lines changed

3 files changed

+21
-6
lines changed

lib/Parse/Lexer.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -188,9 +188,9 @@ Lexer::Lexer(const LangOptions &Options,
188188
.StartsWith("\xEF\xBB\xBF", 3)
189189
.Default(0);
190190

191-
// Since the UTF-8 BOM doesn't carry information (UTF-8 has no dependency
192-
// on byte order), throw it away.
193-
CurPtr = BufferStart + BOMLength;
191+
// Keep information about the existence of a UTF-8 BOM for transparent source code
192+
// editing with libSyntax.
193+
CurPtr = BufferStart;
194194
ContentStart = BufferStart + BOMLength;
195195

196196
// Initialize code completion.
@@ -2036,7 +2036,20 @@ void Lexer::lexImpl() {
20362036
LeadingTrivia.clear();
20372037
TrailingTrivia.clear();
20382038
}
2039-
NextToken.setAtStartOfLine(CurPtr == ContentStart);
2039+
if (CurPtr == BufferStart) {
2040+
if (BufferStart < ContentStart) {
2041+
size_t BOMLen = ContentStart - BufferStart;
2042+
assert(BOMLen == 3 && "UTF-8 BOM is 3 bytes");
2043+
if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
2044+
// Add UTF-8 BOM to LeadingTrivia.
2045+
LeadingTrivia.push_back(TriviaPiece::garbageText({CurPtr, BOMLen}));
2046+
}
2047+
CurPtr += BOMLen;
2048+
}
2049+
NextToken.setAtStartOfLine(true);
2050+
} else {
2051+
NextToken.setAtStartOfLine(false);
2052+
}
20402053

20412054
// Remember where we started so that we can find the comment range.
20422055
LastCommentBlockStart = CurPtr;

unittests/Parse/LexerTests.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,7 @@ TEST_F(LexerTest, BOMNoCommentTrivia) {
400400
ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
401401
ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
402402
ASSERT_EQ((syntax::Trivia{{
403+
syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"),
403404
syntax::TriviaPiece::lineComment("// comment"),
404405
syntax::TriviaPiece::newlines(1)
405406
}}), LeadingTrivia);
@@ -440,6 +441,7 @@ TEST_F(LexerTest, BOMAttachCommentTrivia) {
440441
ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
441442
ASSERT_EQ(10u, Tok.getCommentRange().getByteLength());
442443
ASSERT_EQ((syntax::Trivia{{
444+
syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"),
443445
syntax::TriviaPiece::lineComment("// comment"),
444446
syntax::TriviaPiece::newlines(1)
445447
}}), LeadingTrivia);

unittests/Parse/LexerTriviaTests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,9 @@ TEST_F(LexerTriviaTest, TriviaHashbangAfterBOM) {
102102
ASSERT_EQ("aaa", Tok.getText());
103103
ASSERT_TRUE(Tok.isAtStartOfLine());
104104

105-
// FIXME: This should include the UTF-8 BOM as a GarbageText trivia.
106105
ASSERT_EQ(LeadingTrivia,
107-
(Trivia{{TriviaPiece::garbageText("#!/bin/swift"),
106+
(Trivia{{TriviaPiece::garbageText("\xEF\xBB\xBF"),
107+
TriviaPiece::garbageText("#!/bin/swift"),
108108
TriviaPiece::newlines(1)}}));
109109
}
110110

0 commit comments

Comments
 (0)