Skip to content

Commit b94db7e

Browse files
author
serge-sans-paille
committed
[clang-tidy] Confusable identifiers detection
Detect identifiers that are confusable according to Unicode definition http://www.unicode.org/reports/tr39/#Confusable_Detection and have conflicting scopes. Differential Revision: https://reviews.llvm.org/D112916
1 parent 8b18572 commit b94db7e

File tree

11 files changed

+9906
-0
lines changed

11 files changed

+9906
-0
lines changed

clang-tools-extra/clang-tidy/misc/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,18 @@ set(LLVM_LINK_COMPONENTS
33
Support
44
)
55

6+
add_subdirectory(ConfusableTable)
7+
8+
add_custom_command(
9+
OUTPUT Confusables.inc
10+
COMMAND make_confusable_table ${CMAKE_CURRENT_SOURCE_DIR}/ConfusableTable/confusables.txt ${CMAKE_CURRENT_BINARY_DIR}/Confusables.inc
11+
DEPENDS make_confusable_table ConfusableTable/confusables.txt)
12+
13+
add_custom_target(genconfusable DEPENDS Confusables.inc)
14+
615
add_clang_library(clangTidyMiscModule
716
DefinitionsInHeadersCheck.cpp
17+
Homoglyph.cpp
818
MiscTidyModule.cpp
919
MisleadingBidirectional.cpp
1020
MisleadingIdentifier.cpp
@@ -28,6 +38,7 @@ add_clang_library(clangTidyMiscModule
2838

2939
DEPENDS
3040
omp_gen
41+
genconfusable
3142
)
3243

3344
clang_target_link_libraries(clangTidyMiscModule
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
add_llvm_executable(make_confusable_table build_confusable_table.cpp)
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
//===--- build_confusable_table.cpp - clang-tidy---------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
#include "llvm/ADT/StringExtras.h"
9+
#include "llvm/Support/ConvertUTF.h"
10+
#include "llvm/Support/MemoryBuffer.h"
11+
#include "llvm/Support/raw_ostream.h"
12+
13+
using namespace llvm;
14+
15+
int main(int argc, char *argv[]) {
16+
auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
17+
if (!ErrorOrBuffer)
18+
return 1;
19+
std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get());
20+
StringRef Content = Buffer->getBuffer();
21+
Content = Content.drop_until([](char c) { return c == '#'; });
22+
SmallVector<StringRef> Lines;
23+
SplitString(Content, Lines, "\r\n");
24+
25+
std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
26+
for (StringRef Line : Lines) {
27+
if (Line.startswith("#"))
28+
continue;
29+
30+
SmallVector<StringRef> Values;
31+
Line.split(Values, ';');
32+
if (Values.size() < 2) {
33+
errs() << "Failed to parse: " << Line << "\n";
34+
return 2;
35+
}
36+
37+
llvm::StringRef From = Values[0].trim();
38+
llvm::UTF32 CodePoint;
39+
From.getAsInteger(16, CodePoint);
40+
41+
SmallVector<llvm::UTF32> To;
42+
SmallVector<StringRef> ToN;
43+
Values[1].split(ToN, ' ', -1, false);
44+
for (StringRef To_ : ToN) {
45+
llvm::UTF32 ToCodePoint;
46+
To_.trim().getAsInteger(16, ToCodePoint);
47+
To.push_back(ToCodePoint);
48+
}
49+
while (To.size() < 32)
50+
To.push_back(0);
51+
52+
Entries.emplace_back(CodePoint, To);
53+
}
54+
std::sort(Entries.begin(), Entries.end());
55+
errs() << "Parsed " << Entries.size() << " Entries\n";
56+
57+
std::error_code ec;
58+
llvm::raw_fd_ostream os(argv[2], ec);
59+
os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[32];} "
60+
"ConfusableEntries[] = {\n";
61+
for (auto const &Values : Entries) {
62+
os << " { ";
63+
os << Values.first;
64+
os << ", {";
65+
for (auto CP : Values.second) {
66+
os << CP << ", ";
67+
}
68+
os << "}},\n";
69+
}
70+
os << "};\n";
71+
return 0;
72+
}

clang-tools-extra/clang-tidy/misc/ConfusableTable/confusables.txt

Lines changed: 9638 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
//===--- Homoglyph.cpp - clang-tidy ---------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "Homoglyph.h"
10+
11+
#include "clang/Frontend/CompilerInstance.h"
12+
#include "clang/Lex/Preprocessor.h"
13+
#include "llvm/Support/ConvertUTF.h"
14+
15+
namespace {
16+
// Preprocessed version of
17+
// https://www.unicode.org/Public/security/latest/confusables.txt
18+
//
19+
// This contains a sorted array of { UTF32 codepoint; UTF32 values[N];}
20+
#include "Confusables.inc"
21+
} // namespace
22+
23+
namespace clang {
24+
namespace tidy {
25+
namespace misc {
26+
27+
// Construction only forwards the check name and context to the base class.
Homoglyph::Homoglyph(StringRef Name, ClangTidyContext *Context)
    : ClangTidyCheck(Name, Context) {}

// Defaulted out of line, in this translation unit.
Homoglyph::~Homoglyph() = default;
31+
32+
/**
33+
* Build a skeleton out of the Original identifier, following the algorithm
34+
* described in http://www.unicode.org/reports/tr39/#def-skeleton
35+
*/
36+
std::string Homoglyph::skeleton(StringRef Name) {
37+
std::string SName = Name.str();
38+
std::string Skeleton;
39+
Skeleton.reserve(1 + Name.size());
40+
41+
char const *Curr = SName.c_str();
42+
char const *End = Curr + SName.size();
43+
while (Curr < End) {
44+
45+
char const *Prev = Curr;
46+
llvm::UTF32 CodePoint;
47+
llvm::ConversionResult Result = llvm::convertUTF8Sequence(
48+
(const llvm::UTF8 **)&Curr, (const llvm::UTF8 *)End, &CodePoint,
49+
llvm::strictConversion);
50+
if (Result != llvm::conversionOK) {
51+
llvm::errs() << "Unicode conversion issue\n";
52+
break;
53+
}
54+
55+
StringRef Key(Prev, Curr - Prev);
56+
auto Where = std::lower_bound(
57+
std::begin(ConfusableEntries), std::end(ConfusableEntries), CodePoint,
58+
[](decltype(ConfusableEntries[0]) x, llvm::UTF32 y) {
59+
return x.codepoint < y;
60+
});
61+
if (Where == std::end(ConfusableEntries) || CodePoint != Where->codepoint) {
62+
Skeleton.append(Prev, Curr);
63+
} else {
64+
llvm::UTF8 Buffer[32];
65+
llvm::UTF8 *BufferStart = std::begin(Buffer);
66+
llvm::UTF8 *IBuffer = BufferStart;
67+
const llvm::UTF32 *ValuesStart = std::begin(Where->values);
68+
const llvm::UTF32 *ValuesEnd =
69+
std::find(std::begin(Where->values), std::end(Where->values), '\0');
70+
if (llvm::ConvertUTF32toUTF8(&ValuesStart, ValuesEnd, &IBuffer,
71+
std::end(Buffer), llvm::strictConversion) !=
72+
llvm::conversionOK) {
73+
llvm::errs() << "Unicode conversion issue\n";
74+
break;
75+
}
76+
Skeleton.append((char *)BufferStart, (char *)IBuffer);
77+
}
78+
}
79+
return Skeleton;
80+
}
81+
82+
/// Diagnose a newly matched declaration against every previously seen
/// declaration whose name has the same skeleton.
///
/// Two declarations are reported when their names differ, their skeletons are
/// equal, and the declaration context of one lexically contains the other.
/// The declaration is then recorded so that later matches are compared with
/// it.
void Homoglyph::check(const ast_matchers::MatchFinder::MatchResult &Result) {
  const auto *ND = Result.Nodes.getNodeAs<NamedDecl>("nameddecl");
  if (!ND)
    return;

  // NamedDecl::getName() asserts on names that are not simple identifiers
  // (constructors, destructors, operators, ...), all of which the matcher
  // binds. Going through the identifier skips those as well as unnamed
  // declarations.
  const auto *II = ND->getIdentifier();
  if (!II)
    return;

  StringRef NDName = II->getName();
  auto &Mapped = Mapper[skeleton(NDName)];
  const auto *NDContext = ND->getDeclContext();
  for (const auto *OND : Mapped) {
    // Only declarations whose scopes overlap can be mistaken for each other.
    if (!NDContext->isDeclInLexicalTraversal(OND) &&
        !OND->getDeclContext()->isDeclInLexicalTraversal(ND))
      continue;
    // Same skeleton with the same spelling is not a homoglyph.
    if (OND->getName() != NDName) {
      diag(OND->getLocation(), "%0 is confusable with %1")
          << OND->getName() << NDName;
      diag(ND->getLocation(), "other definition found here",
           DiagnosticIDs::Note);
    }
  }
  Mapped.push_back(ND);
}
101+
102+
/// Register a matcher that binds every named declaration as "nameddecl".
void Homoglyph::registerMatchers(ast_matchers::MatchFinder *Finder) {
  const auto AnyNamedDecl = ast_matchers::namedDecl().bind("nameddecl");
  Finder->addMatcher(AnyNamedDecl, this);
}
105+
106+
} // namespace misc
107+
} // namespace tidy
108+
} // namespace clang
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
//===--- Homoglyph.h - clang-tidy -------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_HOMOGLYPH_H
10+
#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_HOMOGLYPH_H
11+
12+
#include "../ClangTidyCheck.h"
13+
14+
namespace clang {
15+
namespace tidy {
16+
namespace misc {
17+
18+
class Homoglyph : public ClangTidyCheck {
19+
public:
20+
Homoglyph(StringRef Name, ClangTidyContext *Context);
21+
~Homoglyph();
22+
23+
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
24+
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
25+
26+
private:
27+
std::string skeleton(StringRef);
28+
llvm::StringMap<llvm::SmallVector<NamedDecl const *>> Mapper;
29+
};
30+
31+
} // namespace misc
32+
} // namespace tidy
33+
} // namespace clang
34+
35+
#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_HOMOGLYPH_H

clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "../ClangTidyModule.h"
1111
#include "../ClangTidyModuleRegistry.h"
1212
#include "DefinitionsInHeadersCheck.h"
13+
#include "Homoglyph.h"
1314
#include "MisleadingBidirectional.h"
1415
#include "MisleadingIdentifier.h"
1516
#include "MisplacedConstCheck.h"
@@ -37,6 +38,7 @@ class MiscModule : public ClangTidyModule {
3738
"misc-definitions-in-headers");
3839
CheckFactories.registerCheck<MisleadingBidirectionalCheck>(
3940
"misc-misleading-bidirectional");
41+
CheckFactories.registerCheck<Homoglyph>("misc-homoglyph");
4042
CheckFactories.registerCheck<MisleadingIdentifierCheck>(
4143
"misc-misleading-identifier");
4244
CheckFactories.registerCheck<MisplacedConstCheck>("misc-misplaced-const");

clang-tools-extra/docs/ReleaseNotes.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,10 @@ New checks
136136
Future libc++ will remove the extension (`D120996
137137
<https://reviews.llvm.org/D120996>`).
138138

139+
- New :doc:`misc-homoglyph <clang-tidy/checks/misc-homoglyph>` check.
140+
141+
Detects confusable Unicode identifiers.
142+
139143
New check aliases
140144
^^^^^^^^^^^^^^^^^
141145

clang-tools-extra/docs/clang-tidy/checks/list.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ Clang-Tidy Checks
214214
`llvmlibc-implementation-in-namespace <llvmlibc-implementation-in-namespace.html>`_,
215215
`llvmlibc-restrict-system-libc-headers <llvmlibc-restrict-system-libc-headers.html>`_, "Yes"
216216
`misc-definitions-in-headers <misc-definitions-in-headers.html>`_, "Yes"
217+
`misc-homoglyph <misc-homoglyph.html>`_,
217218
`misc-misleading-bidirectional <misc-misleading-bidirectional.html>`_,
218219
`misc-misleading-identifier <misc-misleading-identifier.html>`_,
219220
`misc-misplaced-const <misc-misplaced-const.html>`_,
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
.. title:: clang-tidy - misc-homoglyph
2+
3+
misc-homoglyph
4+
==============
5+
6+
Warn about confusable identifiers, i.e. identifiers that are visually close to
7+
each other, but use different Unicode characters. This detects potential attacks
8+
as described in `Trojan Source <https://www.trojansource.codes>`_.
9+
10+
Example:
11+
12+
.. code-block:: c++
13+
14+
int fo;
15+
int 𝐟o;
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// RUN: %check_clang_tidy %s misc-homoglyph %t
2+
3+
int fo;
4+
// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: fo is confusable with 𝐟o [misc-homoglyph]
5+
int 𝐟o;
6+
// CHECK-MESSAGES: :[[#@LINE-1]]:5: note: other definition found here
7+
8+
void no() {
9+
int 𝐟oo;
10+
}
11+
12+
void worry() {
13+
int foo;
14+
}
15+
16+
int 𝐟i;
17+
// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 𝐟i is confusable with fi [misc-homoglyph]
18+
int fi;
19+
// CHECK-MESSAGES: :[[#@LINE-1]]:5: note: other definition found here

0 commit comments

Comments
 (0)