Skip to content

Commit 6444ee5

Browse files
committed
[Offload] Allow to record kernel launch stack traces
Similar to (de)allocation traces, we can record kernel launch stack traces and display them in case of an error. However, the AMD GPU plugin signal handler, which is invoked on memory faults, cannot pinpoint the offending kernel. Instead, we print `<NUM>` many traces, where `<NUM>` is set via `OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=<NUM>`. The recording uses a ring buffer of fixed size (for now 8). For `trap` errors, we print the actual kernel name, and the trace if it was recorded.
1 parent c3a3bab commit 6444ee5

File tree

14 files changed

+544
-21
lines changed

14 files changed

+544
-21
lines changed

offload/include/Shared/EnvironmentVar.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ struct StringParser {
2828
/// Class for reading and checking environment variables. Currently working with
2929
/// integer, floats, std::string and bool types.
3030
template <typename Ty> class Envar {
31+
llvm::StringRef Name;
3132
Ty Data;
3233
bool IsPresent;
3334
bool Initialized;
@@ -53,7 +54,7 @@ template <typename Ty> class Envar {
5354
/// take the value read from the environment variable, or the default if it
5455
/// was not set or not correct. This constructor is not fallible.
5556
Envar(llvm::StringRef Name, Ty Default = Ty())
56-
: Data(Default), IsPresent(false), Initialized(true) {
57+
: Name(Name), Data(Default), IsPresent(false), Initialized(true) {
5758

5859
if (const char *EnvStr = getenv(Name.data())) {
5960
// Check whether the envar is defined and valid.
@@ -84,6 +85,9 @@ template <typename Ty> class Envar {
8485
/// Get the definitive value.
8586
operator Ty() const { return get(); }
8687

88+
/// Return the environment variable name.
89+
llvm::StringRef getName() const { return Name; }
90+
8791
/// Indicate whether the environment variable was defined and valid.
8892
bool isPresent() const { return IsPresent; }
8993

offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ typedef enum {
3131
HSA_STATUS_ERROR = 0x1000,
3232
HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
3333
HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
34+
HSA_STATUS_ERROR_EXCEPTION = 0x1016,
3435
} hsa_status_t;
3536

3637
hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 66 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@
1313
#include <atomic>
1414
#include <cassert>
1515
#include <cstddef>
16+
#include <cstdint>
1617
#include <deque>
18+
#include <functional>
1719
#include <mutex>
1820
#include <string>
1921
#include <system_error>
2022
#include <unistd.h>
2123
#include <unordered_map>
2224

25+
#include "ErrorReporting.h"
2326
#include "Shared/APITypes.h"
2427
#include "Shared/Debug.h"
2528
#include "Shared/Environment.h"
@@ -43,6 +46,7 @@
4346
#include "llvm/Support/FileSystem.h"
4447
#include "llvm/Support/MemoryBuffer.h"
4548
#include "llvm/Support/Program.h"
49+
#include "llvm/Support/Signals.h"
4650
#include "llvm/Support/raw_ostream.h"
4751

4852
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
@@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
685689
AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
686690

687691
/// Lazily initialize a new queue belonging to a specific agent.
688-
Error init(hsa_agent_t Agent, int32_t QueueSize) {
692+
Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
689693
if (Queue)
690694
return Plugin::success();
691695
hsa_status_t Status =
692696
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
693-
nullptr, UINT32_MAX, UINT32_MAX, &Queue);
697+
&Device, UINT32_MAX, UINT32_MAX, &Queue);
694698
return Plugin::check(Status, "Error in hsa_queue_create: %s");
695699
}
696700

@@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
875879
}
876880

877881
/// Callback that will be called when an error is detected on the HSA queue.
878-
static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
879-
auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
880-
FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
881-
}
882+
static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
883+
void *Data);
882884

883885
/// The HSA queue.
884886
hsa_queue_t *Queue;
@@ -1214,6 +1216,9 @@ struct AMDGPUStreamTy {
12141216
/// Deinitialize the stream's signals.
12151217
Error deinit() { return Plugin::success(); }
12161218

1219+
/// Return the associated (device) agent.
1220+
hsa_agent_t getAgent() const { return Agent; }
1221+
12171222
/// Attach an RPC server to this stream.
12181223
void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
12191224

@@ -1484,6 +1489,8 @@ struct AMDGPUStreamTy {
14841489
return true;
14851490
}
14861491

1492+
const AMDGPUQueueTy *getQueue() const { return Queue; }
1493+
14871494
/// Record the state of the stream on an event.
14881495
Error recordEvent(AMDGPUEventTy &Event) const;
14891496

@@ -1594,7 +1601,7 @@ struct AMDGPUStreamManagerTy final
15941601
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
15951602

15961603
AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
1597-
: GenericDeviceResourceManagerTy(Device),
1604+
: GenericDeviceResourceManagerTy(Device), Device(Device),
15981605
OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
15991606
NextQueue(0), Agent(HSAAgent) {}
16001607

@@ -1603,7 +1610,7 @@ struct AMDGPUStreamManagerTy final
16031610
QueueSize = HSAQueueSize;
16041611
MaxNumQueues = NumHSAQueues;
16051612
// Initialize one queue eagerly
1606-
if (auto Err = Queues.front().init(Agent, QueueSize))
1613+
if (auto Err = Queues.front().init(Device, Agent, QueueSize))
16071614
return Err;
16081615

16091616
return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,14 +1667,17 @@ struct AMDGPUStreamManagerTy final
16601667
}
16611668

16621669
// Make sure the queue is initialized, then add user & assign.
1663-
if (auto Err = Queues[Index].init(Agent, QueueSize))
1670+
if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
16641671
return Err;
16651672
Queues[Index].addUser();
16661673
Stream->Queue = &Queues[Index];
16671674

16681675
return Plugin::success();
16691676
}
16701677

1678+
/// The device associated with this stream manager.
1679+
GenericDeviceTy &Device;
1680+
16711681
/// Envar for controlling the tracking of busy HSA queues.
16721682
BoolEnvar OMPX_QueueTracking;
16731683

@@ -3074,7 +3084,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
30743084
Initialized = true;
30753085

30763086
// Register event handler to detect memory errors on the devices.
3077-
Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
3087+
Status = hsa_amd_register_system_event_handler(eventHandler, this);
30783088
if (auto Err = Plugin::check(
30793089
Status, "Error in hsa_amd_register_system_event_handler: %s"))
30803090
return std::move(Err);
@@ -3209,7 +3219,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
32093219

32103220
private:
32113221
/// Event handler that will be called by ROCr if an event is detected.
3212-
static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
3222+
static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
3223+
void *PluginPtr) {
32133224
if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
32143225
return HSA_STATUS_SUCCESS;
32153226

@@ -3240,6 +3251,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
32403251
uint32_t Node = -1;
32413252
hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
32423253

3254+
AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
3255+
for (uint32_t I = 0, E = Plugin.getNumDevices();
3256+
Node != uint32_t(-1) && I < E; ++I) {
3257+
AMDGPUDeviceTy &AMDGPUDevice =
3258+
reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
3259+
auto KernelTraceInfoRecord =
3260+
AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3261+
3262+
uint32_t DeviceNode = -1;
3263+
if (auto Err =
3264+
AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
3265+
consumeError(std::move(Err));
3266+
continue;
3267+
}
3268+
if (DeviceNode != Node)
3269+
continue;
3270+
3271+
ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
3272+
}
3273+
32433274
// Abort the execution since we do not recover from this error.
32443275
FATAL_MESSAGE(1,
32453276
"Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
@@ -3480,6 +3511,30 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
34803511
return Alloc;
34813512
}
34823513

3514+
void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
3515+
void *Data) {
3516+
auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
3517+
3518+
if (Status == HSA_STATUS_ERROR_EXCEPTION) {
3519+
auto KernelTraceInfoRecord =
3520+
AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
3521+
std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
3522+
[=](__tgt_async_info &AsyncInfo) {
3523+
auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
3524+
if (!Stream)
3525+
return false;
3526+
if (!Stream->getQueue())
3527+
return false;
3528+
return Stream->getQueue()->Queue == Source;
3529+
};
3530+
ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
3531+
AsyncInfoWrapperMatcher);
3532+
}
3533+
3534+
auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
3535+
FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
3536+
}
3537+
34833538
} // namespace plugin
34843539
} // namespace target
34853540
} // namespace omp

offload/plugins-nextgen/common/include/ErrorReporting.h

Lines changed: 102 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@
1212
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
1313

1414
#include "PluginInterface.h"
15+
#include "Shared/EnvironmentVar.h"
1516

1617
#include "llvm/ADT/SmallString.h"
1718
#include "llvm/ADT/StringRef.h"
1819
#include "llvm/Support/ErrorHandling.h"
1920

21+
#include <cstdint>
2022
#include <cstdio>
2123
#include <cstdlib>
24+
#include <functional>
25+
#include <optional>
2226
#include <string>
2327

2428
namespace llvm {
@@ -84,11 +88,24 @@ class ErrorReporter {
8488

8589
/// Print \p Format, instantiated with \p Args to stderr.
8690
/// TODO: Allow redirection into a file stream.
91+
#pragma clang diagnostic push
92+
#pragma clang diagnostic ignored "-Wgcc-compat"
93+
#pragma clang diagnostic ignored "-Wformat-security"
8794
template <typename... ArgsTy>
88-
static void print(const char *Format, ArgsTy &&...Args) {
95+
[[gnu::format(__printf__, 1, 2)]] static void print(const char *Format,
96+
ArgsTy &&...Args) {
8997
fprintf(stderr, Format, std::forward<ArgsTy>(Args)...);
9098
}
9199

100+
/// Report an error.
101+
template <typename... ArgsTy>
102+
[[gnu::format(__printf__, 1, 2)]] static void reportError(const char *Format,
103+
ArgsTy &&...Args) {
104+
print(getCString("%s%s%s\n%s", Red(), ErrorBanner, Format, Default()),
105+
Args...);
106+
}
107+
#pragma clang diagnostic pop
108+
92109
/// Pretty print a stack trace.
93110
static void reportStackTrace(StringRef StackTrace) {
94111
if (StackTrace.empty())
@@ -100,6 +117,7 @@ class ErrorReporter {
100117
for (int I = Start, E = Lines.size(); I < E; ++I) {
101118
auto Line = Lines[I];
102119
Parts.clear();
120+
Line = Line.drop_while([](char C) { return std::isspace(C); });
103121
Line.split(Parts, " ", /*MaxSplit=*/2);
104122
if (Parts.size() != 3 || Parts[0].size() < 2 || Parts[0][0] != '#') {
105123
print("%s\n", Line.str().c_str());
@@ -116,19 +134,13 @@ class ErrorReporter {
116134
printf("\n");
117135
}
118136

119-
/// Report an error.
120-
static void reportError(const char *Message, StringRef StackTrace) {
121-
print("%s%s%s\n%s", Red(), ErrorBanner, Message, Default());
122-
reportStackTrace(StackTrace);
123-
}
124-
125137
/// Report information about an allocation associated with \p ATI.
126138
static void reportAllocationInfo(AllocationTraceInfoTy *ATI) {
127139
if (!ATI)
128140
return;
129141

130142
if (!ATI->DeallocationTrace.empty()) {
131-
print("%s%s%s\n%s", Cyan(), "Last deallocation:", Default());
143+
print("%s%s\n%s", Cyan(), "Last deallocation:", Default());
132144
reportStackTrace(ATI->DeallocationTrace);
133145
}
134146

@@ -166,7 +178,8 @@ class ErrorReporter {
166178
TargetAllocTy Kind, AllocationTraceInfoTy *ATI,
167179
std::string &StackTrace) {
168180
#define DEALLOCATION_ERROR(Format, ...) \
169-
reportError(getCString(Format, __VA_ARGS__), StackTrace); \
181+
reportError(Format, __VA_ARGS__); \
182+
reportStackTrace(StackTrace); \
170183
reportAllocationInfo(ATI); \
171184
abort();
172185

@@ -190,6 +203,86 @@ class ErrorReporter {
190203

191204
#undef DEALLOCATION_ERROR
192205
}
206+
207+
/// Report that a kernel encountered a trap instruction.
208+
static void reportTrapInKernel(
209+
GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
210+
std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher) {
211+
assert(AsyncInfoWrapperMatcher && "A matcher is required");
212+
213+
uint32_t Idx = 0;
214+
for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
215+
auto KTI = KTIR.getKernelTraceInfo(I);
216+
if (KTI.Kernel == nullptr)
217+
break;
218+
// Skip kernels issued in other queues.
219+
if (KTI.AsyncInfo && !(AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
220+
continue;
221+
Idx = I;
222+
break;
223+
}
224+
225+
auto KTI = KTIR.getKernelTraceInfo(Idx);
226+
if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
227+
reportError("Kernel '%s'", KTI.Kernel->getName());
228+
reportError("execution interrupted by hardware trap instruction");
229+
if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
230+
reportStackTrace(KTI.LaunchTrace);
231+
abort();
232+
}
233+
234+
/// Report the kernel traces taken from \p KTIR, up to
235+
/// OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES many.
236+
static void reportKernelTraces(GenericDeviceTy &Device,
237+
KernelTraceInfoRecordTy &KTIR) {
238+
uint32_t NumKTIs = 0;
239+
for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
240+
auto KTI = KTIR.getKernelTraceInfo(I);
241+
if (KTI.Kernel == nullptr)
242+
break;
243+
++NumKTIs;
244+
}
245+
if (NumKTIs == 0) {
246+
print("%sNo kernel launches known\n%s", Red(), Default());
247+
return;
248+
}
249+
250+
uint32_t TracesToShow =
251+
std::min(Device.OMPX_TrackNumKernelLaunches.get(), NumKTIs);
252+
if (TracesToShow == 0) {
253+
if (NumKTIs == 1) {
254+
print("%sDisplay only launched kernel:\n%s", Cyan(), Default());
255+
} else {
256+
print("%sDisplay last %u kernels launched:\n%s", Cyan(), NumKTIs,
257+
Default());
258+
}
259+
} else {
260+
if (NumKTIs == 1) {
261+
print("%sDisplay kernel launch trace:\n%s", Cyan(), Default());
262+
} else {
263+
print("%sDisplay %u of the %u last kernel launch traces:\n%s", Cyan(),
264+
TracesToShow, NumKTIs, Default());
265+
}
266+
}
267+
268+
for (uint32_t Idx = 0, I = 0; I < NumKTIs; ++Idx) {
269+
auto KTI = KTIR.getKernelTraceInfo(Idx);
270+
if (NumKTIs == 1) {
271+
print("%sKernel '%s'\n%s", Magenta(), KTI.Kernel->getName(), Default());
272+
} else {
273+
print("%sKernel %d: '%s'\n%s", Magenta(), I, KTI.Kernel->getName(),
274+
Default());
275+
}
276+
reportStackTrace(KTI.LaunchTrace);
277+
++I;
278+
}
279+
280+
if (NumKTIs != 1) {
281+
print("Use '%s=<num>' to adjust the number of shown traces (up to %zu)\n",
282+
Device.OMPX_TrackNumKernelLaunches.getName().data(), KTIR.size());
283+
}
284+
// TODO: Let users know how to serialize kernels
285+
}
193286
};
194287

195288
} // namespace plugin

0 commit comments

Comments
 (0)