Skip to content

[Offload] Allow to record kernel launch stack traces #100472

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion offload/include/Shared/EnvironmentVar.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ struct StringParser {
/// Class for reading and checking environment variables. Currently working with
/// integers, floats, std::string and bool types.
template <typename Ty> class Envar {
llvm::StringRef Name;
Ty Data;
bool IsPresent;
bool Initialized;
Expand All @@ -53,7 +54,7 @@ template <typename Ty> class Envar {
/// take the value read from the environment variable, or the default if it
/// was not set or not correct. This constructor is not fallible.
Envar(llvm::StringRef Name, Ty Default = Ty())
: Data(Default), IsPresent(false), Initialized(true) {
: Name(Name), Data(Default), IsPresent(false), Initialized(true) {

if (const char *EnvStr = getenv(Name.data())) {
// Check whether the envar is defined and valid.
Expand Down Expand Up @@ -84,6 +85,9 @@ template <typename Ty> class Envar {
/// Get the definitive value.
operator Ty() const { return get(); }

/// Return the environment variable name.
llvm::StringRef getName() const { return Name; }

/// Indicate whether the environment variable was defined and valid.
bool isPresent() const { return IsPresent; }

Expand Down
1 change: 1 addition & 0 deletions offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ typedef enum {
HSA_STATUS_ERROR = 0x1000,
HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
HSA_STATUS_ERROR_EXCEPTION = 0x1016,
} hsa_status_t;

hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);
Expand Down
72 changes: 61 additions & 11 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <mutex>
#include <string>
#include <system_error>
#include <unistd.h>
#include <unordered_map>

#include "ErrorReporting.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
Expand All @@ -43,6 +46,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"

#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
Expand Down Expand Up @@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}

/// Lazily initialize a new queue belonging to a specific agent.
Error init(hsa_agent_t Agent, int32_t QueueSize) {
Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
if (Queue)
return Plugin::success();
hsa_status_t Status =
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
nullptr, UINT32_MAX, UINT32_MAX, &Queue);
&Device, UINT32_MAX, UINT32_MAX, &Queue);
return Plugin::check(Status, "Error in hsa_queue_create: %s");
}

Expand Down Expand Up @@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
}

/// Callback that will be called when an error is detected on the HSA queue.
static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}
static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
void *Data);

/// The HSA queue.
hsa_queue_t *Queue;
Expand Down Expand Up @@ -1484,6 +1486,8 @@ struct AMDGPUStreamTy {
return true;
}

const AMDGPUQueueTy *getQueue() const { return Queue; }

/// Record the state of the stream on an event.
Error recordEvent(AMDGPUEventTy &Event) const;

Expand Down Expand Up @@ -1594,7 +1598,7 @@ struct AMDGPUStreamManagerTy final
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;

AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
: GenericDeviceResourceManagerTy(Device),
: GenericDeviceResourceManagerTy(Device), Device(Device),
OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
NextQueue(0), Agent(HSAAgent) {}

Expand All @@ -1603,7 +1607,7 @@ struct AMDGPUStreamManagerTy final
QueueSize = HSAQueueSize;
MaxNumQueues = NumHSAQueues;
// Initialize one queue eagerly
if (auto Err = Queues.front().init(Agent, QueueSize))
if (auto Err = Queues.front().init(Device, Agent, QueueSize))
return Err;

return GenericDeviceResourceManagerTy::init(InitialSize);
Expand Down Expand Up @@ -1660,14 +1664,17 @@ struct AMDGPUStreamManagerTy final
}

// Make sure the queue is initialized, then add user & assign.
if (auto Err = Queues[Index].init(Agent, QueueSize))
if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
return Err;
Queues[Index].addUser();
Stream->Queue = &Queues[Index];

return Plugin::success();
}

/// The device associated with this stream.
GenericDeviceTy &Device;

/// Envar for controlling the tracking of busy HSA queues.
BoolEnvar OMPX_QueueTracking;

Expand Down Expand Up @@ -3074,7 +3081,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Initialized = true;

// Register event handler to detect memory errors on the devices.
Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
Status = hsa_amd_register_system_event_handler(eventHandler, this);
if (auto Err = Plugin::check(
Status, "Error in hsa_amd_register_system_event_handler: %s"))
return std::move(Err);
Expand Down Expand Up @@ -3209,7 +3216,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {

private:
/// Event handler that will be called by ROCr if an event is detected.
static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
void *PluginPtr) {
if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
return HSA_STATUS_SUCCESS;

Expand Down Expand Up @@ -3240,6 +3248,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
uint32_t Node = -1;
hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);

AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
for (uint32_t I = 0, E = Plugin.getNumDevices();
Node != uint32_t(-1) && I < E; ++I) {
AMDGPUDeviceTy &AMDGPUDevice =
reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
auto KernelTraceInfoRecord =
AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();

uint32_t DeviceNode = -1;
if (auto Err =
AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
consumeError(std::move(Err));
continue;
}
if (DeviceNode != Node)
continue;

ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
}

// Abort the execution since we do not recover from this error.
FATAL_MESSAGE(1,
"Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
Expand Down Expand Up @@ -3480,6 +3508,28 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
return Alloc;
}

/// Callback invoked by the HSA runtime when an error is detected on a queue.
///
/// \p Data is the AMDGPUDeviceTy the queue belongs to, registered as the user
/// data argument of hsa_queue_create (see AMDGPUQueueTy::init).
void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
                                  void *Data) {
  // static_cast is the idiomatic (and sufficient) cast from void* back to the
  // object pointer type that was stored.
  auto &AMDGPUDevice = *static_cast<AMDGPUDeviceTy *>(Data);

  // If a kernel hit a hardware trap instruction, report the recorded kernel
  // launch traces for the stream owning the failing queue before aborting.
  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
    auto KernelTraceInfoRecord =
        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
    // Match only launches whose underlying HSA queue is the one that
    // reported the error. Capture just \p Source; nothing else is needed.
    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
        [Source](__tgt_async_info &AsyncInfo) {
          auto *Stream = static_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
          if (!Stream || !Stream->getQueue())
            return false;
          return Stream->getQueue()->Queue == Source;
        };
    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
                                      AsyncInfoWrapperMatcher);
  }

  // Queue errors are not recoverable: emit the HSA error string and abort.
  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}

} // namespace plugin
} // namespace target
} // namespace omp
Expand Down
85 changes: 85 additions & 0 deletions offload/plugins-nextgen/common/include/ErrorReporting.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "PluginInterface.h"
#include "Shared/EnvironmentVar.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
Expand Down Expand Up @@ -216,6 +217,90 @@ class ErrorReporter {
getAllocTyName(ATI->Kind).data(), DevicePtr);
#undef DEALLOCATION_ERROR
}

/// Report that a kernel encountered a trap instruction.
static void reportTrapInKernel(
GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher) {
assert(AsyncInfoWrapperMatcher && "A matcher is required");

uint32_t Idx = 0;
for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
auto KTI = KTIR.getKernelTraceInfo(I);
Comment on lines +228 to +229
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does llvm::enumerate work here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likely, is that better? It has no precedent in Offload rn, maybe worth introducing in one swoop, if it is preferred.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a huge deal, just tends to be cleaner.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can change it here for the final version, assuming I won't encounter any problems.

if (KTI.Kernel == nullptr)
break;
// Skip kernels issued in other queues.
if (KTI.AsyncInfo && !(AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
continue;
Idx = I;
break;
}

auto KTI = KTIR.getKernelTraceInfo(Idx);
if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
reportError("Kernel '%s'", KTI.Kernel->getName());
reportError("execution interrupted by hardware trap instruction");
if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo))) {
if (!KTI.LaunchTrace.empty())
reportStackTrace(KTI.LaunchTrace);
else
print(Yellow, "Use '%s=1' to show the stack trace of the kernel\n",
Device.OMPX_TrackNumKernelLaunches.getName().data());
}
abort();
}

/// Report the kernel traces taken from \p KTIR, up to
/// OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES many.
static void reportKernelTraces(GenericDeviceTy &Device,
KernelTraceInfoRecordTy &KTIR) {
uint32_t NumKTIs = 0;
for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
auto KTI = KTIR.getKernelTraceInfo(I);
if (KTI.Kernel == nullptr)
break;
++NumKTIs;
}
if (NumKTIs == 0) {
print(BoldRed, "No kernel launches known\n");
return;
}

uint32_t TracesToShow =
std::min(Device.OMPX_TrackNumKernelLaunches.get(), NumKTIs);
if (TracesToShow == 0) {
if (NumKTIs == 1)
print(BoldLightPurple, "Display only launched kernel:\n");
else
print(BoldLightPurple, "Display last %u kernels launched:\n", NumKTIs);
} else {
if (NumKTIs == 1)
print(BoldLightPurple, "Display kernel launch trace:\n");
else
print(BoldLightPurple,
"Display %u of the %u last kernel launch traces:\n", TracesToShow,
NumKTIs);
}

for (uint32_t Idx = 0, I = 0; I < NumKTIs; ++Idx) {
auto KTI = KTIR.getKernelTraceInfo(Idx);
if (NumKTIs == 1)
print(BoldLightPurple, "Kernel '%s'\n", KTI.Kernel->getName());
else
print(BoldLightPurple, "Kernel %d: '%s'\n", I, KTI.Kernel->getName());
reportStackTrace(KTI.LaunchTrace);
++I;
}

if (NumKTIs != 1) {
print(Yellow,
"Use '%s=<num>' to adjust the number of shown stack traces (%u "
"now, up to %zu)\n",
Device.OMPX_TrackNumKernelLaunches.getName().data(),
Device.OMPX_TrackNumKernelLaunches.get(), KTIR.size());
}
// TODO: Let users know how to serialize kernels
}
};

} // namespace plugin
Expand Down
46 changes: 46 additions & 0 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,44 @@ struct AllocationTraceInfoTy {
std::mutex Lock;
};

/// Information about a single kernel launch: which kernel was launched, the
/// stack trace taken at the launch site, and the async info ("stream") it was
/// launched in, for error reporting purposes.
struct KernelTraceInfoTy {

  /// The launched kernel.
  GenericKernelTy *Kernel;

  /// The stack trace of the launch itself (empty if tracking was disabled).
  std::string LaunchTrace;

  /// The async info the kernel was launched in.
  __tgt_async_info *AsyncInfo;
};

struct KernelTraceInfoRecordTy {
KernelTraceInfoRecordTy() { KTIs.fill({}); }

/// Return the (maximal) record size.
auto size() const { return KTIs.size(); }

/// Create a new kernel trace info and add it into the record.
void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
__tgt_async_info *AsyncInfo) {
KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
Idx = (Idx + 1) % size();
}

/// Return the \p I'th last kernel trace info.
auto getKernelTraceInfo(int32_t I) const {
// Note that kernel trace infos "grow forward", so lookup is backwards.
return KTIs[(Idx - I - 1 + size()) % size()];
}

private:
std::array<KernelTraceInfoTy, 8> KTIs;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we using a static array here? We could use SmallVector<x, 8> and get the same stack layout, or is there a need for no dynamic resizing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now, I use it as a fixed size ring buffer. It is unclear to me why one would store more than the last few traces, at least for now.

unsigned Idx = 0;
};

/// Class representing a map of host pinned allocations. We track these pinned
/// allocations, so memory transfers involving these buffers can be optimized.
class PinnedAllocationMapTy {
Expand Down Expand Up @@ -900,6 +938,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// been deallocated, both for error reporting purposes.
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;

/// Map to record kernels that have been launched, for error reporting purposes.
ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;

/// Environment variable to determine if stack traces for kernel launches are
/// tracked.
UInt32Envar OMPX_TrackNumKernelLaunches =
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);

private:
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
Expand Down
12 changes: 12 additions & 0 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1468,6 +1468,18 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
GenericKernelTy &GenericKernel =
*reinterpret_cast<GenericKernelTy *>(EntryPtr);

{
std::string StackTrace;
if (OMPX_TrackNumKernelLaunches) {
llvm::raw_string_ostream OS(StackTrace);
llvm::sys::PrintStackTrace(OS);
}

auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
(*KernelTraceInfoRecord)
.emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
}

auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
AsyncInfoWrapper);

Expand Down
Loading
Loading