Skip to content

Commit 216af81

Browse files
committed
[clangd] Fix invalid UTF8 when extracting doc comments.
Differential Revision: https://reviews.llvm.org/D88567
1 parent c722b32 commit 216af81

File tree

3 files changed

+18
-3
lines changed

3 files changed

+18
-3
lines changed

clang-tools-extra/clangd/CodeCompletionStrings.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "clang/AST/RawCommentList.h"
1313
#include "clang/Basic/SourceManager.h"
1414
#include "clang/Sema/CodeCompleteConsumer.h"
15+
#include "llvm/Support/JSON.h"
1516
#include <limits>
1617
#include <utility>
1718

@@ -86,7 +87,12 @@ std::string getDeclComment(const ASTContext &Ctx, const NamedDecl &Decl) {
8687
assert(!Ctx.getSourceManager().isLoadedSourceLocation(RC->getBeginLoc()));
8788
std::string Doc =
8889
RC->getFormattedText(Ctx.getSourceManager(), Ctx.getDiagnostics());
89-
return looksLikeDocComment(Doc) ? Doc : "";
90+
if (!looksLikeDocComment(Doc))
91+
return "";
92+
// Clang requires source to be UTF-8, but doesn't enforce this in comments.
93+
if (!llvm::json::isUTF8(Doc))
94+
Doc = llvm::json::fixUTF8(Doc);
95+
return Doc;
9096
}
9197

9298
void getSignature(const CodeCompletionString &CCS, std::string *Signature,

clang-tools-extra/clangd/unittests/CodeCompletionStringsTests.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "CodeCompletionStrings.h"
10+
#include "TestTU.h"
1011
#include "clang/Sema/CodeCompleteConsumer.h"
1112
#include "gmock/gmock.h"
1213
#include "gtest/gtest.h"
@@ -56,6 +57,14 @@ TEST_F(CompletionStringTest, DocumentationWithAnnotation) {
5657
"Annotation: Ano\n\nIs this brief?");
5758
}
5859

60+
// Checks the text getDeclComment() returns for a declaration whose preceding
// comment contains a byte that is not valid UTF-8.
TEST_F(CompletionStringTest, GetDeclCommentBadUTF8) {
61+
// The byte 0xFF never occurs in well-formed UTF-8. The expected string below
// has it replaced by EF BF BD, the UTF-8 encoding of U+FFFD.
62+
auto TU = TestTU::withCode("/*x\xffy*/ struct X;");
63+
auto AST = TU.build();
64+
EXPECT_EQ("x\xef\xbf\xbdy",
65+
getDeclComment(AST.getASTContext(), findDecl(AST, "X")));
66+
}
67+
5968
TEST_F(CompletionStringTest, MultipleAnnotations) {
6069
Builder.AddAnnotation("Ano1");
6170
Builder.AddAnnotation("Ano2");

clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1606,11 +1606,11 @@ TEST_F(SymbolCollectorTest, BadUTF8) {
16061606
// Extracted from boost/spirit/home/support/char_encoding/iso8859_1.hpp
16071607
// This looks like UTF-8 and fools clang, but has high-ISO-8859-1 comments.
16081608
const char *Header = "int PUNCT = 0;\n"
1609-
"int types[] = { /* \xa1 */PUNCT };";
1609+
"/* \xa1 */ int types[] = { /* \xa1 */PUNCT };";
16101610
CollectorOpts.RefFilter = RefKind::All;
16111611
CollectorOpts.RefsInHeaders = true;
16121612
runSymbolCollector(Header, "");
1613-
EXPECT_THAT(Symbols, Contains(QName("types")));
1613+
EXPECT_THAT(Symbols, Contains(AllOf(QName("types"), Doc("\xef\xbf\xbd "))));
16141614
EXPECT_THAT(Symbols, Contains(QName("PUNCT")));
16151615
// Reference is stored, although offset within line is not reliable.
16161616
EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "PUNCT").ID, _)));

0 commit comments

Comments
 (0)