Skip to content

Commit c3574ef

Browse files
author
serge-sans-paille
committed
[clang-tidy] Confusable identifiers detection
Detect identifiers that are confusable using a variant of the Unicode definition http://www.unicode.org/reports/tr39/#Confusable_Detection and have conflicting scopes. This is a recommit (with portability and feature fixes) of b94db7e. Differential Revision: https://reviews.llvm.org/D112916
1 parent 3c86789 commit c3574ef

File tree

11 files changed

+9958
-0
lines changed

11 files changed

+9958
-0
lines changed

clang-tools-extra/clang-tidy/misc/CMakeLists.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,27 @@ set(LLVM_LINK_COMPONENTS
33
Support
44
)
55

6+
# Select the executable used to generate Confusables.inc.
# NOTE(review): build_native_tool is presumably LLVM's helper for building a
# host-side copy of a tool when cross-compiling -- confirm against LLVM's CMake
# modules. Otherwise the in-tree make-confusable-table target is used directly.
if(LLVM_USE_HOST_TOOLS)
7+
build_native_tool(make-confusable-table make_confusable_table)
8+
set(make_confusable_table_target "${make_confusable_table}")
9+
else()
10+
set(make_confusable_table $<TARGET_FILE:make-confusable-table>)
11+
set(make_confusable_table_target make-confusable-table)
12+
endif()
13+
14+
# The ConfusableTable subdirectory defines the make-confusable-table executable.
add_subdirectory(ConfusableTable)
15+
16+
17+
# Run the generator on ConfusableTable/confusables.txt to produce
# Confusables.inc in the current binary directory. The rule depends on both the
# generator and the input data file.
add_custom_command(
18+
OUTPUT Confusables.inc
19+
COMMAND ${make_confusable_table} ${CMAKE_CURRENT_SOURCE_DIR}/ConfusableTable/confusables.txt ${CMAKE_CURRENT_BINARY_DIR}/Confusables.inc
20+
DEPENDS ${make_confusable_table_target} ConfusableTable/confusables.txt)
21+
22+
# Named target wrapping the generated file so that library targets can list it
# in their DEPENDS.
add_custom_target(genconfusable DEPENDS Confusables.inc)
23+
624
add_clang_library(clangTidyMiscModule
725
DefinitionsInHeadersCheck.cpp
26+
ConfusableIdentifierCheck.cpp
827
MiscTidyModule.cpp
928
MisleadingBidirectional.cpp
1029
MisleadingIdentifier.cpp
@@ -28,6 +47,7 @@ add_clang_library(clangTidyMiscModule
2847

2948
DEPENDS
3049
omp_gen
50+
genconfusable
3151
)
3252

3353
clang_target_link_libraries(clangTidyMiscModule
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
//===--- ConfusableIdentifierCheck.cpp -
2+
// clang-tidy--------------------------===//
3+
//
4+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
//===----------------------------------------------------------------------===//
9+
10+
#include "ConfusableIdentifierCheck.h"
11+
12+
#include "clang/Frontend/CompilerInstance.h"
13+
#include "clang/Lex/Preprocessor.h"
14+
#include "llvm/Support/ConvertUTF.h"
15+
16+
namespace {
17+
// Preprocessed version of
18+
// https://www.unicode.org/Public/security/latest/confusables.txt
19+
//
20+
// This contains a sorted array of { UTF32 codepoint; UTF32 values[N];}
21+
#include "Confusables.inc"
22+
} // namespace
23+
24+
namespace clang {
25+
namespace tidy {
26+
namespace misc {
27+
28+
// Forwards the check name and the clang-tidy context to the ClangTidyCheck
// base class. No member is initialized explicitly here, so Mapper is
// default-constructed.
ConfusableIdentifierCheck::ConfusableIdentifierCheck(StringRef Name,
29+
ClangTidyContext *Context)
30+
: ClangTidyCheck(Name, Context) {}
31+
32+
// Out-of-line defaulted destructor; the header only declares it.
ConfusableIdentifierCheck::~ConfusableIdentifierCheck() = default;
33+
34+
// Build a skeleton out of the Original identifier, inspired by the algorithm
35+
// described in http://www.unicode.org/reports/tr39/#def-skeleton
36+
//
37+
// FIXME: TR39 mandates:
38+
//
39+
// For an input string X, define skeleton(X) to be the following transformation
40+
// on the string:
41+
//
42+
// 1. Convert X to NFD format, as described in [UAX15].
43+
// 2. Concatenate the prototypes for each character in X according to the
44+
// specified data, producing a string of exemplar characters.
45+
// 3. Reapply NFD.
46+
//
47+
// We're skipping 1. and 3. for the sake of simplicity, but this can lead to
48+
// false positives.
49+
50+
// Computes the skeleton of the identifier \p Name and returns it as a UTF-8
// encoded string.
//
// \p Name is decoded one UTF-8 sequence at a time. A code point that has an
// entry in ConfusableEntries is replaced by the UTF-8 encoding of that entry's
// prototype values (the values up to, but not including, the first zero). Any
// other code point is copied through unchanged.
//
// If decoding or re-encoding fails, "Unicode conversion issue" is written to
// errs() and the skeleton built up to that point is returned.
std::string ConfusableIdentifierCheck::skeleton(StringRef Name) {
  using namespace llvm;
  std::string Skeleton;
  Skeleton.reserve(1 + Name.size());

  // The decoder takes an explicit end pointer, so the StringRef can be walked
  // in place without first copying it into a null-terminated std::string.
  const char *Curr = Name.data();
  const char *End = Curr + Name.size();
  while (Curr < End) {
    const char *Prev = Curr;
    UTF32 CodePoint = 0;
    // On success the decoder advances Curr past the sequence it consumed, so
    // [Prev, Curr) is the original encoding of CodePoint.
    const ConversionResult Result = convertUTF8Sequence(
        reinterpret_cast<const UTF8 **>(&Curr),
        reinterpret_cast<const UTF8 *>(End), &CodePoint, strictConversion);
    if (Result != conversionOK) {
      errs() << "Unicode conversion issue\n";
      break;
    }

    // ConfusableEntries is sorted by codepoint, so a binary search finds the
    // candidate entry.
    const auto Where = std::lower_bound(
        std::begin(ConfusableEntries), std::end(ConfusableEntries), CodePoint,
        [](const auto &Entry, UTF32 Wanted) {
          return Entry.codepoint < Wanted;
        });
    if (Where == std::end(ConfusableEntries) ||
        CodePoint != Where->codepoint) {
      // Not a confusable character: keep its original bytes.
      Skeleton.append(Prev, Curr);
      continue;
    }

    // A code point needs at most 4 UTF-8 bytes, so one byte per byte of the
    // UTF-32 value array (assuming UTF32 is a 32-bit type, as its name
    // suggests) is enough for the longest prototype sequence in the table. A
    // fixed 32-byte buffer was too small for long prototype sequences, which
    // made the conversion fail and truncated the skeleton.
    UTF8 Buffer[sizeof(ConfusableEntries[0].values)];
    UTF8 *BufferStart = std::begin(Buffer);
    UTF8 *IBuffer = BufferStart;
    const UTF32 *ValuesStart = std::begin(Where->values);
    // The prototype sequence ends at the first zero value.
    const UTF32 *ValuesEnd =
        std::find(std::begin(Where->values), std::end(Where->values), '\0');
    if (ConvertUTF32toUTF8(&ValuesStart, ValuesEnd, &IBuffer,
                           std::end(Buffer),
                           strictConversion) != conversionOK) {
      errs() << "Unicode conversion issue\n";
      break;
    }
    Skeleton.append(reinterpret_cast<const char *>(BufferStart),
                    reinterpret_cast<const char *>(IBuffer));
  }
  return Skeleton;
}
95+
96+
void ConfusableIdentifierCheck::check(
97+
const ast_matchers::MatchFinder::MatchResult &Result) {
98+
if (const auto *ND = Result.Nodes.getNodeAs<NamedDecl>("nameddecl")) {
99+
if (IdentifierInfo *II = ND->getIdentifier()) {
100+
StringRef NDName = II->getName();
101+
llvm::SmallVector<const NamedDecl *> &Mapped = Mapper[skeleton(NDName)];
102+
const DeclContext *NDDecl = ND->getDeclContext();
103+
for (const NamedDecl *OND : Mapped) {
104+
if (!NDDecl->isDeclInLexicalTraversal(OND) &&
105+
!OND->getDeclContext()->isDeclInLexicalTraversal(ND))
106+
continue;
107+
if (OND->getIdentifier()->getName() != NDName) {
108+
diag(OND->getLocation(), "%0 is confusable with %1")
109+
<< OND->getName() << NDName;
110+
diag(ND->getLocation(), "other declaration found here",
111+
DiagnosticIDs::Note);
112+
}
113+
}
114+
Mapped.push_back(ND);
115+
}
116+
}
117+
}
118+
119+
// Registers a single matcher that binds every named declaration to the
// "nameddecl" id, with this check as the callback.
void ConfusableIdentifierCheck::registerMatchers(
    ast_matchers::MatchFinder *Finder) {
  using ast_matchers::namedDecl;
  Finder->addMatcher(namedDecl().bind("nameddecl"), this);
}
123+
124+
} // namespace misc
125+
} // namespace tidy
126+
} // namespace clang
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
//===--- ConfusableIdentifierCheck.h - clang-tidy
2+
//-------------------------------*- C++ -*-===//
3+
//
4+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
//===----------------------------------------------------------------------===//
9+
10+
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H
11+
#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H
12+
13+
#include "../ClangTidyCheck.h"
14+
15+
namespace clang {
16+
namespace tidy {
17+
namespace misc {
18+
19+
/// Finds symbols which have confusable identifiers, i.e. identifiers that look
20+
/// the same visually but have a different Unicode representation.
21+
/// If symbols are confusable but don't live in conflicting namespaces, they are
22+
/// not reported.
23+
class ConfusableIdentifierCheck : public ClangTidyCheck {
24+
public:
25+
ConfusableIdentifierCheck(StringRef Name, ClangTidyContext *Context);
26+
~ConfusableIdentifierCheck();
27+
28+
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
29+
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
30+
31+
private:
32+
std::string skeleton(StringRef);
33+
llvm::StringMap<llvm::SmallVector<const NamedDecl *>> Mapper;
34+
};
35+
36+
} // namespace misc
37+
} // namespace tidy
38+
} // namespace clang
39+
40+
#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
//===--- BuildConfusableTable.cpp - clang-tidy---------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
#include "llvm/ADT/StringExtras.h"
9+
#include "llvm/Support/ConvertUTF.h"
10+
#include "llvm/Support/MemoryBuffer.h"
11+
#include "llvm/Support/raw_ostream.h"
12+
13+
#include <algorithm>
14+
15+
using namespace llvm;
16+
17+
int main(int argc, char *argv[]) {
18+
auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
19+
if (!ErrorOrBuffer)
20+
return 1;
21+
std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get());
22+
StringRef Content = Buffer->getBuffer();
23+
Content = Content.drop_until([](char c) { return c == '#'; });
24+
SmallVector<StringRef> Lines;
25+
SplitString(Content, Lines, "\r\n");
26+
27+
std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
28+
SmallVector<StringRef> Values;
29+
for (StringRef Line : Lines) {
30+
if (Line.startswith("#"))
31+
continue;
32+
33+
Values.clear();
34+
Line.split(Values, ';');
35+
if (Values.size() < 2) {
36+
errs() << "Failed to parse: " << Line << "\n";
37+
return 2;
38+
}
39+
40+
llvm::StringRef From = Values[0].trim();
41+
llvm::UTF32 CodePoint;
42+
From.getAsInteger(16, CodePoint);
43+
44+
SmallVector<llvm::UTF32> To;
45+
SmallVector<StringRef> ToN;
46+
Values[1].split(ToN, ' ', -1, false);
47+
for (StringRef To_ : ToN) {
48+
llvm::UTF32 ToCodePoint;
49+
To_.trim().getAsInteger(16, ToCodePoint);
50+
To.push_back(ToCodePoint);
51+
}
52+
// Sentinel
53+
To.push_back(0);
54+
55+
Entries.emplace_back(CodePoint, To);
56+
}
57+
std::sort(Entries.begin(), Entries.end());
58+
59+
unsigned LargestValue =
60+
std::max_element(Entries.begin(), Entries.end(),
61+
[](const auto &Entry0, const auto &Entry1) {
62+
return Entry0.second.size() < Entry1.second.size();
63+
})
64+
->second.size();
65+
66+
std::error_code ec;
67+
llvm::raw_fd_ostream os(argv[2], ec);
68+
69+
// FIXME: If memory consumption and/or lookup time becomes a constraint, it
70+
// maybe worth using a more elaborate data structure.
71+
os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
72+
<< "];} "
73+
"ConfusableEntries[] = {\n";
74+
for (const auto &Values : Entries) {
75+
os << " { ";
76+
os << Values.first;
77+
os << ", {";
78+
for (auto CP : Values.second)
79+
os << CP << ", ";
80+
81+
os << "}},\n";
82+
}
83+
os << "};\n";
84+
return 0;
85+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Build-time generator executable, built from BuildConfusableTable.cpp.
add_llvm_executable(make-confusable-table BuildConfusableTable.cpp)

0 commit comments

Comments
 (0)