Skip to content

Commit c95abe9

Browse files
authored
[Offload] Implement double free (and other allocation error) reporting (#100261)
As a first step towards a GPU sanitizer we now can track allocations and deallocations in order to report double frees, and other problems during deallocation.
1 parent 2acf77f commit c95abe9

File tree

10 files changed

+527
-1
lines changed

10 files changed

+527
-1
lines changed
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
//===- ErrorReporting.h - Helper to provide nice error messages ----- c++ -===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
12+
#define OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
13+
14+
#include "PluginInterface.h"
15+
#include "Shared/EnvironmentVar.h"
16+
17+
#include "llvm/ADT/SmallString.h"
18+
#include "llvm/ADT/StringRef.h"
19+
#include "llvm/Support/ErrorHandling.h"
20+
#include "llvm/Support/WithColor.h"
21+
#include "llvm/Support/raw_ostream.h"
22+
23+
#include <cstdint>
24+
#include <cstdio>
25+
#include <cstdlib>
26+
#include <functional>
27+
#include <optional>
28+
#include <string>
29+
#include <unistd.h>
30+
31+
namespace llvm {
32+
namespace omp {
33+
namespace target {
34+
namespace plugin {
35+
36+
class ErrorReporter {
37+
38+
enum ColorTy {
39+
Yellow = int(HighlightColor::Address),
40+
Green = int(HighlightColor::String),
41+
DarkBlue = int(HighlightColor::Tag),
42+
Cyan = int(HighlightColor::Attribute),
43+
DarkPurple = int(HighlightColor::Enumerator),
44+
DarkRed = int(HighlightColor::Macro),
45+
BoldRed = int(HighlightColor::Error),
46+
BoldLightPurple = int(HighlightColor::Warning),
47+
BoldDarkGrey = int(HighlightColor::Note),
48+
BoldLightBlue = int(HighlightColor::Remark),
49+
};
50+
51+
/// Prefix that is emitted in front of every error report.
static constexpr const char *ErrorBanner = "OFFLOAD ERROR: ";
53+
54+
/// Return the device id as string, or n/a if not available.
55+
static std::string getDeviceIdStr(GenericDeviceTy *Device) {
56+
return Device ? std::to_string(Device->getDeviceId()) : "n/a";
57+
}
58+
59+
/// Return a nice name for an TargetAllocTy.
60+
static StringRef getAllocTyName(TargetAllocTy Kind) {
61+
switch (Kind) {
62+
case TARGET_ALLOC_DEVICE_NON_BLOCKING:
63+
case TARGET_ALLOC_DEFAULT:
64+
case TARGET_ALLOC_DEVICE:
65+
return "device memory";
66+
case TARGET_ALLOC_HOST:
67+
return "pinned host memory";
68+
case TARGET_ALLOC_SHARED:
69+
return "managed memory";
70+
break;
71+
}
72+
llvm_unreachable("Unknown target alloc kind");
73+
}
74+
75+
#pragma clang diagnostic push
76+
#pragma clang diagnostic ignored "-Wgcc-compat"
77+
#pragma clang diagnostic ignored "-Wformat-security"
78+
/// Print \p Format, instantiated with \p Args to stderr.
79+
/// TODO: Allow redirection into a file stream.
80+
template <typename... ArgsTy>
81+
[[gnu::format(__printf__, 1, 2)]] static void print(const char *Format,
82+
ArgsTy &&...Args) {
83+
raw_fd_ostream OS(STDERR_FILENO, false);
84+
OS << llvm::format(Format, Args...);
85+
}
86+
87+
/// Print \p Format, instantiated with \p Args to stderr, but colored.
88+
/// TODO: Allow redirection into a file stream.
89+
template <typename... ArgsTy>
90+
[[gnu::format(__printf__, 2, 3)]] static void
91+
print(ColorTy Color, const char *Format, ArgsTy &&...Args) {
92+
raw_fd_ostream OS(STDERR_FILENO, false);
93+
WithColor(OS, HighlightColor(Color)) << llvm::format(Format, Args...);
94+
}
95+
96+
/// Print \p Format, instantiated with \p Args to stderr, but colored and with
97+
/// a banner.
98+
/// TODO: Allow redirection into a file stream.
99+
template <typename... ArgsTy>
100+
[[gnu::format(__printf__, 1, 2)]] static void reportError(const char *Format,
101+
ArgsTy &&...Args) {
102+
print(BoldRed, "%s", ErrorBanner);
103+
print(BoldRed, Format, Args...);
104+
print("\n");
105+
}
106+
#pragma clang diagnostic pop
107+
108+
static void reportError(const char *Str) { reportError("%s", Str); }
109+
static void print(const char *Str) { print("%s", Str); }
110+
static void print(StringRef Str) { print("%s", Str.str().c_str()); }
111+
static void print(ColorTy Color, const char *Str) { print(Color, "%s", Str); }
112+
static void print(ColorTy Color, StringRef Str) {
113+
print(Color, "%s", Str.str().c_str());
114+
}
115+
116+
/// Pretty print a stack trace.
117+
static void reportStackTrace(StringRef StackTrace) {
118+
if (StackTrace.empty())
119+
return;
120+
121+
SmallVector<StringRef> Lines, Parts;
122+
StackTrace.split(Lines, "\n", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
123+
int Start = Lines.empty() || !Lines[0].contains("PrintStackTrace") ? 0 : 1;
124+
unsigned NumDigits =
125+
(int)(floor(log10(Lines.size() - Start - /*0*/ 1)) + 1);
126+
for (int I = Start, E = Lines.size(); I < E; ++I) {
127+
auto Line = Lines[I];
128+
Parts.clear();
129+
Line = Line.drop_while([](char C) { return std::isspace(C); });
130+
Line.split(Parts, " ", /*MaxSplit=*/2);
131+
if (Parts.size() != 3 || Parts[0].size() < 2 || Parts[0][0] != '#') {
132+
print("%s\n", Line.str().c_str());
133+
continue;
134+
}
135+
unsigned FrameIdx = std::stoi(Parts[0].drop_front(1).str());
136+
if (Start)
137+
FrameIdx -= 1;
138+
print(DarkPurple, " %s", Parts[0].take_front().str().c_str());
139+
print(Green, "%*u", NumDigits, FrameIdx);
140+
print(BoldLightBlue, " %s", Parts[1].str().c_str());
141+
print(" %s\n", Parts[2].str().c_str());
142+
}
143+
print("\n");
144+
}
145+
146+
/// Report information about an allocation associated with \p ATI.
147+
static void reportAllocationInfo(AllocationTraceInfoTy *ATI) {
148+
if (!ATI)
149+
return;
150+
151+
if (!ATI->DeallocationTrace.empty()) {
152+
print(BoldLightPurple, "Last deallocation:\n");
153+
reportStackTrace(ATI->DeallocationTrace);
154+
}
155+
156+
if (ATI->HostPtr)
157+
print(BoldLightPurple,
158+
"Last allocation of size %lu for host pointer %p:\n", ATI->Size,
159+
ATI->HostPtr);
160+
else
161+
print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size);
162+
reportStackTrace(ATI->AllocationTrace);
163+
if (!ATI->LastAllocationInfo)
164+
return;
165+
166+
unsigned I = 0;
167+
print(BoldLightPurple, "Prior allocations with the same base pointer:");
168+
while (ATI->LastAllocationInfo) {
169+
print("\n");
170+
ATI = ATI->LastAllocationInfo;
171+
print(BoldLightPurple, " #%u Prior deallocation of size %lu:\n", I,
172+
ATI->Size);
173+
reportStackTrace(ATI->DeallocationTrace);
174+
if (ATI->HostPtr)
175+
print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n",
176+
I, ATI->HostPtr);
177+
else
178+
print(BoldLightPurple, " #%u Prior allocation:\n", I);
179+
reportStackTrace(ATI->AllocationTrace);
180+
++I;
181+
}
182+
}
183+
184+
/// Terminate the program by calling abort.
static void abortExecution() {
  std::abort();
}
186+
187+
public:
188+
#define DEALLOCATION_ERROR(Format, ...) \
189+
reportError(Format, __VA_ARGS__); \
190+
reportStackTrace(StackTrace); \
191+
reportAllocationInfo(ATI); \
192+
abortExecution();
193+
194+
static void reportDeallocationOfNonAllocatedPtr(void *DevicePtr,
195+
TargetAllocTy Kind,
196+
AllocationTraceInfoTy *ATI,
197+
std::string &StackTrace) {
198+
DEALLOCATION_ERROR("deallocation of non-allocated %s: %p",
199+
getAllocTyName(Kind).data(), DevicePtr);
200+
}
201+
202+
static void reportDeallocationOfDeallocatedPtr(void *DevicePtr,
203+
TargetAllocTy Kind,
204+
AllocationTraceInfoTy *ATI,
205+
std::string &StackTrace) {
206+
DEALLOCATION_ERROR("double-free of %s: %p", getAllocTyName(Kind).data(),
207+
DevicePtr);
208+
}
209+
210+
static void reportDeallocationOfWrongPtrKind(void *DevicePtr,
211+
TargetAllocTy Kind,
212+
AllocationTraceInfoTy *ATI,
213+
std::string &StackTrace) {
214+
DEALLOCATION_ERROR("deallocation requires %s but allocation was %s: %p",
215+
getAllocTyName(Kind).data(),
216+
getAllocTyName(ATI->Kind).data(), DevicePtr);
217+
#undef DEALLOCATION_ERROR
218+
}
219+
};
220+
221+
} // namespace plugin
222+
} // namespace target
223+
} // namespace omp
224+
} // namespace llvm
225+
226+
#endif // OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <shared_mutex>
2020
#include <vector>
2121

22+
#include "ExclusiveAccess.h"
2223
#include "Shared/APITypes.h"
2324
#include "Shared/Debug.h"
2425
#include "Shared/Environment.h"
@@ -382,6 +383,35 @@ struct GenericKernelTy {
382383
bool IsBareKernel = false;
383384
};
384385

386+
/// Information about an allocation, when it has been allocated, and when/if it
387+
/// has been deallocated, for error reporting purposes.
388+
struct AllocationTraceInfoTy {
389+
390+
/// The stack trace of the allocation itself.
391+
std::string AllocationTrace;
392+
393+
/// The stack trace of the deallocation, or empty.
394+
std::string DeallocationTrace;
395+
396+
/// The allocated device pointer.
397+
void *DevicePtr = nullptr;
398+
399+
/// The corresponding host pointer (can be null).
400+
void *HostPtr = nullptr;
401+
402+
/// The size of the allocation.
403+
uint64_t Size = 0;
404+
405+
/// The kind of the allocation.
406+
TargetAllocTy Kind = TargetAllocTy::TARGET_ALLOC_DEFAULT;
407+
408+
/// Information about the last allocation at this address, if any.
409+
AllocationTraceInfoTy *LastAllocationInfo = nullptr;
410+
411+
/// Lock to keep accesses race free.
412+
std::mutex Lock;
413+
};
414+
385415
/// Class representing a map of host pinned allocations. We track these pinned
386416
/// allocations, so memory transfers involving these buffers can be optimized.
387417
class PinnedAllocationMapTy {
@@ -866,6 +896,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
866896
/// Reference to the underlying plugin that created this device.
867897
GenericPluginTy &Plugin;
868898

899+
/// Map to record when allocations have been performed, and when they have
900+
/// been deallocated, both for error reporting purposes.
901+
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
902+
869903
private:
870904
/// Get and set the stack size and heap size for the device. If not used, the
871905
/// plugin can implement the setters as no-op and setting the output
@@ -916,6 +950,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
916950
UInt32Envar OMPX_InitialNumStreams;
917951
UInt32Envar OMPX_InitialNumEvents;
918952

953+
/// Environment variable to determine if stack traces for allocations and
954+
/// deallocations are tracked.
955+
BoolEnvar OMPX_TrackAllocationTraces =
956+
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
957+
919958
/// Array of images loaded into the device. Images are automatically
920959
/// deallocated by the allocator.
921960
llvm::SmallVector<DeviceImageTy *> LoadedImages;

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "Shared/Debug.h"
1515
#include "Shared/Environment.h"
1616

17+
#include "ErrorReporting.h"
1718
#include "GlobalHandler.h"
1819
#include "JIT.h"
1920
#include "Utils/ELF.h"
@@ -30,6 +31,8 @@
3031
#include "llvm/Support/JSON.h"
3132
#include "llvm/Support/MathExtras.h"
3233
#include "llvm/Support/MemoryBuffer.h"
34+
#include "llvm/Support/Signals.h"
35+
#include "llvm/Support/raw_ostream.h"
3336

3437
#include <cstdint>
3538
#include <limits>
@@ -1337,6 +1340,25 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
13371340
if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
13381341
return std::move(Err);
13391342

1343+
// Keep track of the allocation stack if we track allocation traces.
1344+
if (OMPX_TrackAllocationTraces) {
1345+
std::string StackTrace;
1346+
llvm::raw_string_ostream OS(StackTrace);
1347+
llvm::sys::PrintStackTrace(OS);
1348+
1349+
AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
1350+
ATI->AllocationTrace = std::move(StackTrace);
1351+
ATI->DevicePtr = Alloc;
1352+
ATI->HostPtr = HostPtr;
1353+
ATI->Size = Size;
1354+
ATI->Kind = Kind;
1355+
1356+
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
1357+
auto *&MapATI = (*AllocationTraceMap)[Alloc];
1358+
ATI->LastAllocationInfo = MapATI;
1359+
MapATI = ATI;
1360+
}
1361+
13401362
return Alloc;
13411363
}
13421364

@@ -1345,6 +1367,37 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
13451367
if (Plugin.getRecordReplay().isRecordingOrReplaying())
13461368
return Plugin::success();
13471369

1370+
// Keep track of the deallocation stack if we track allocation traces.
1371+
if (OMPX_TrackAllocationTraces) {
1372+
AllocationTraceInfoTy *ATI = nullptr;
1373+
{
1374+
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
1375+
ATI = (*AllocationTraceMap)[TgtPtr];
1376+
}
1377+
1378+
std::string StackTrace;
1379+
llvm::raw_string_ostream OS(StackTrace);
1380+
llvm::sys::PrintStackTrace(OS);
1381+
1382+
if (!ATI)
1383+
ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
1384+
StackTrace);
1385+
1386+
// ATI is not null, thus we can lock it to inspect and modify it further.
1387+
std::lock_guard<std::mutex> LG(ATI->Lock);
1388+
if (!ATI->DeallocationTrace.empty())
1389+
ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
1390+
StackTrace);
1391+
1392+
if (ATI->Kind != Kind)
1393+
ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
1394+
StackTrace);
1395+
1396+
ATI->DeallocationTrace = StackTrace;
1397+
1398+
#undef DEALLOCATION_ERROR
1399+
}
1400+
13481401
int Res;
13491402
switch (Kind) {
13501403
case TARGET_ALLOC_DEFAULT:

offload/src/omptarget.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,9 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
462462
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
463463

464464
if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
465-
FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr");
465+
FATAL_MESSAGE(DeviceNum, "%s",
466+
"Failed to deallocate device ptr. Set "
467+
"OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations.");
466468

467469
DP("omp_target_free deallocated device ptr\n");
468470
}

0 commit comments

Comments
 (0)