Skip to content

Commit dfdedaf

Browse files
authored
[analyzer] Switch to PostStmt callbacks in ArrayBoundV2 (#72107)
...instead of the currently used, more abstract Location callback. The main advantage of this change is that after it the checker will check `array[index].field` while the previous implementation ignored this situation (because here the ElementRegion is wrapped in a FieldRegion object). This improvement fixes PR #70187. Note that after this change `&array[idx]` will be handled as an access to the `idx`th element of `array`, which is technically incorrect but matches the programmer intuitions. In my opinion it's more helpful if the report points to the source location where the indexing happens (instead of the location where a pointer is finally dereferenced). As a special case, this change allows code that forms the past-the-end pointer of an array as `&arr[size]` (but still rejects code like `if (idx >= size) return &array[idx];` and code that dereferences a past-the-end pointer). In addition to this primary improvement, this change tweaks the message for the tainted index/offset case (using the more concrete information that's available now) and clarifies/improves a few testcases. The main change of this commit (replacing `check::Location` with `check::PostStmt<...>` callbacks) was already proposed in my change https://reviews.llvm.org/D150446 and https://reviews.llvm.org/D159107 by steakhal. Those reviews were both abandoned, but the problems that led to abandonment were unrelated to the change that is introduced in this PR.
1 parent 9e4210f commit dfdedaf

File tree

4 files changed

+235
-47
lines changed

4 files changed

+235
-47
lines changed

clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp

Lines changed: 101 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
//===----------------------------------------------------------------------===//
1313

1414
#include "clang/AST/CharUnits.h"
15+
#include "clang/AST/ParentMapContext.h"
1516
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
1617
#include "clang/StaticAnalyzer/Checkers/Taint.h"
1718
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
@@ -34,20 +35,46 @@ using llvm::formatv;
3435
namespace {
3536
enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
3637

37-
class ArrayBoundCheckerV2 :
38-
public Checker<check::Location> {
38+
struct Messages {
39+
std::string Short, Full;
40+
};
41+
42+
// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
43+
// instead of `PreStmt` because the current implementation passes the whole
44+
// expression to `CheckerContext::getSVal()` which only works after the
45+
// symbolic evaluation of the expression. (To turn them into `PreStmt`
46+
// callbacks, we'd need to duplicate the logic that evaluates these
47+
// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
48+
// defined as `PostStmt` for the sake of consistency with the other callbacks.
49+
class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
50+
check::PostStmt<UnaryOperator>,
51+
check::PostStmt<MemberExpr>> {
3952
BugType BT{this, "Out-of-bound access"};
4053
BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
4154

55+
void performCheck(const Expr *E, CheckerContext &C) const;
56+
4257
void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
43-
NonLoc Offset, std::string RegName, std::string Msg) const;
58+
NonLoc Offset, Messages Msgs) const;
4459

4560
static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
4661

62+
static bool isInAddressOf(const Stmt *S, ASTContext &AC);
63+
4764
public:
48-
void checkLocation(SVal l, bool isLoad, const Stmt *S,
49-
CheckerContext &C) const;
65+
void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
66+
performCheck(E, C);
67+
}
68+
void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
69+
if (E->getOpcode() == UO_Deref)
70+
performCheck(E, C);
71+
}
72+
void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
73+
if (E->isArrow())
74+
performCheck(E->getBase(), C);
75+
}
5076
};
77+
5178
} // anonymous namespace
5279

5380
/// For a given Location that can be represented as a symbolic expression
@@ -149,9 +176,11 @@ getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
149176
// where the first one corresponds to "value below threshold" and the second
150177
// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
151178
// the case when the evaluation fails.
179+
// If the optional argument CheckEquality is true, then use BO_EQ instead of
180+
// the default BO_LT after consistently applying the same simplification steps.
152181
static std::pair<ProgramStateRef, ProgramStateRef>
153182
compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
154-
SValBuilder &SVB) {
183+
SValBuilder &SVB, bool CheckEquality = false) {
155184
if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
156185
std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
157186
}
@@ -167,8 +196,10 @@ compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
167196
return {nullptr, State};
168197
}
169198
}
199+
const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
170200
auto BelowThreshold =
171-
SVB.evalBinOpNN(State, BO_LT, Value, Threshold, SVB.getConditionType()).getAs<NonLoc>();
201+
SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
202+
.getAs<NonLoc>();
172203

173204
if (BelowThreshold)
174205
return State->assume(*BelowThreshold);
@@ -217,16 +248,19 @@ static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
217248
return formatv(ShortMsgTemplates[Kind], RegName);
218249
}
219250

220-
static std::string getPrecedesMsg(std::string RegName, NonLoc Offset) {
251+
static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
252+
std::string RegName = getRegionName(Region);
221253
SmallString<128> Buf;
222254
llvm::raw_svector_ostream Out(Buf);
223255
Out << "Access of " << RegName << " at negative byte offset";
224256
if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
225257
Out << ' ' << ConcreteIdx->getValue();
226-
return std::string(Buf);
258+
return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
227259
}
228-
static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
229-
NonLoc Offset, NonLoc Extent, SVal Location) {
260+
261+
static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
262+
NonLoc Offset, NonLoc Extent, SVal Location) {
263+
std::string RegName = getRegionName(Region);
230264
const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
231265
assert(EReg && "this checker only handles element access");
232266
QualType ElemType = EReg->getElementType();
@@ -273,20 +307,18 @@ static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
273307
Out << "s";
274308
}
275309

276-
return std::string(Buf);
277-
}
278-
static std::string getTaintMsg(std::string RegName) {
279-
SmallString<128> Buf;
280-
llvm::raw_svector_ostream Out(Buf);
281-
Out << "Access of " << RegName
282-
<< " with a tainted offset that may be too large";
283-
return std::string(Buf);
310+
return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
284311
}
285312

286-
void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
287-
const Stmt *LoadS,
288-
CheckerContext &C) const {
313+
static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
314+
std::string RegName = getRegionName(Region);
315+
return {formatv("Potential out of bound access to {0} with tainted {1}",
316+
RegName, OffsetName),
317+
formatv("Access of {0} with a tainted {1} that may be too large",
318+
RegName, OffsetName)};
319+
}
289320

321+
void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
290322
// NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
291323
// some new logic here that reasons directly about memory region extents.
292324
// Once that logic is more mature, we can bring it back to assumeInBound()
@@ -297,12 +329,14 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
297329
// have some flexibility in defining the base region, we can achieve
298330
// various levels of conservatism in our buffer overflow checking.
299331

332+
const SVal Location = C.getSVal(E);
333+
300334
// The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
301335
// #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
302336
// and incomplete analysis of these leads to false positives. As even
303337
// accurate reports would be confusing for the users, just disable reports
304338
// from these macros:
305-
if (isFromCtypeMacro(LoadS, C.getASTContext()))
339+
if (isFromCtypeMacro(E, C.getASTContext()))
306340
return;
307341

308342
ProgramStateRef State = C.getState();
@@ -331,9 +365,8 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
331365

332366
if (PrecedesLowerBound && !WithinLowerBound) {
333367
// We know that the index definitely precedes the lower bound.
334-
std::string RegName = getRegionName(Reg);
335-
std::string Msg = getPrecedesMsg(RegName, ByteOffset);
336-
reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, RegName, Msg);
368+
Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
369+
reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
337370
return;
338371
}
339372

@@ -350,17 +383,38 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
350383
if (ExceedsUpperBound) {
351384
if (!WithinUpperBound) {
352385
// We know that the index definitely exceeds the upper bound.
353-
std::string RegName = getRegionName(Reg);
354-
std::string Msg = getExceedsMsg(C.getASTContext(), RegName, ByteOffset,
355-
*KnownSize, Location);
356-
reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, RegName, Msg);
386+
if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
387+
// ...but this is within an addressof expression, so we need to check
388+
// for the exceptional case that `&array[size]` is valid.
389+
auto [EqualsToThreshold, NotEqualToThreshold] =
390+
compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize,
391+
SVB, /*CheckEquality=*/true);
392+
if (EqualsToThreshold && !NotEqualToThreshold) {
393+
// We are definitely in the exceptional case, so return early
394+
// instead of reporting a bug.
395+
C.addTransition(EqualsToThreshold);
396+
return;
397+
}
398+
}
399+
Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
400+
*KnownSize, Location);
401+
reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
357402
return;
358403
}
359404
if (isTainted(State, ByteOffset)) {
360-
// Both cases are possible, but the index is tainted, so report.
405+
// Both cases are possible, but the offset is tainted, so report.
361406
std::string RegName = getRegionName(Reg);
362-
std::string Msg = getTaintMsg(RegName);
363-
reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, RegName, Msg);
407+
408+
// Diagnostic detail: "tainted offset" is always correct, but the
409+
// common case is that 'idx' is tainted in 'arr[idx]' and then it's
410+
// nicer to say "tainted index".
411+
const char *OffsetName = "offset";
412+
if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
413+
if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
414+
OffsetName = "index";
415+
416+
Messages Msgs = getTaintMsgs(Reg, OffsetName);
417+
reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
364418
return;
365419
}
366420
}
@@ -374,17 +428,14 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
374428

375429
void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
376430
ProgramStateRef ErrorState, OOB_Kind Kind,
377-
NonLoc Offset, std::string RegName,
378-
std::string Msg) const {
431+
NonLoc Offset, Messages Msgs) const {
379432

380433
ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
381434
if (!ErrorNode)
382435
return;
383436

384-
std::string ShortMsg = getShortMsg(Kind, RegName);
385-
386437
auto BR = std::make_unique<PathSensitiveBugReport>(
387-
Kind == OOB_Taint ? TaintBT : BT, ShortMsg, Msg, ErrorNode);
438+
Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
388439

389440
// Track back the propagation of taintedness.
390441
if (Kind == OOB_Taint)
@@ -413,6 +464,18 @@ bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
413464
(MacroName == "isupper") || (MacroName == "isxdigit"));
414465
}
415466

467+
bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
468+
ParentMapContext &ParentCtx = ACtx.getParentMapContext();
469+
do {
470+
const DynTypedNodeList Parents = ParentCtx.getParents(*S);
471+
if (Parents.empty())
472+
return false;
473+
S = Parents[0].get<Stmt>();
474+
} while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
475+
const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
476+
return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
477+
}
478+
416479
void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
417480
mgr.registerChecker<ArrayBoundCheckerV2>();
418481
}

0 commit comments

Comments
 (0)