Skip to content

[Offload] Allow to record kernel launch stack traces #100472

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion offload/include/Shared/EnvironmentVar.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ struct StringParser {
/// Class for reading and checking environment variables. Currently working with
/// integers, floats, std::string and bool types.
template <typename Ty> class Envar {
llvm::StringRef Name;
Ty Data;
bool IsPresent;
bool Initialized;
Expand All @@ -53,7 +54,7 @@ template <typename Ty> class Envar {
/// take the value read from the environment variable, or the default if it
/// was not set or not correct. This constructor is not fallible.
Envar(llvm::StringRef Name, Ty Default = Ty())
: Data(Default), IsPresent(false), Initialized(true) {
: Name(Name), Data(Default), IsPresent(false), Initialized(true) {

if (const char *EnvStr = getenv(Name.data())) {
// Check whether the envar is defined and valid.
Expand Down Expand Up @@ -84,6 +85,9 @@ template <typename Ty> class Envar {
/// Get the definitive value.
operator Ty() const { return get(); }

/// Return the environment variable name.
llvm::StringRef getName() const { return Name; }

/// Indicate whether the environment variable was defined and valid.
bool isPresent() const { return IsPresent; }

Expand Down
1 change: 1 addition & 0 deletions offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ typedef enum {
HSA_STATUS_ERROR = 0x1000,
HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
HSA_STATUS_ERROR_EXCEPTION = 0x1016,
} hsa_status_t;

hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);
Expand Down
72 changes: 61 additions & 11 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <mutex>
#include <string>
#include <system_error>
#include <unistd.h>
#include <unordered_map>

#include "ErrorReporting.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
Expand All @@ -43,6 +46,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"

#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
Expand Down Expand Up @@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}

/// Lazily initialize a new queue belonging to a specific agent.
Error init(hsa_agent_t Agent, int32_t QueueSize) {
Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
if (Queue)
return Plugin::success();
hsa_status_t Status =
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
nullptr, UINT32_MAX, UINT32_MAX, &Queue);
&Device, UINT32_MAX, UINT32_MAX, &Queue);
return Plugin::check(Status, "Error in hsa_queue_create: %s");
}

Expand Down Expand Up @@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
}

/// Callback that will be called when an error is detected on the HSA queue.
static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}
static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
void *Data);

/// The HSA queue.
hsa_queue_t *Queue;
Expand Down Expand Up @@ -1484,6 +1486,8 @@ struct AMDGPUStreamTy {
return true;
}

const AMDGPUQueueTy *getQueue() const { return Queue; }

/// Record the state of the stream on an event.
Error recordEvent(AMDGPUEventTy &Event) const;

Expand Down Expand Up @@ -1594,7 +1598,7 @@ struct AMDGPUStreamManagerTy final
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;

AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
: GenericDeviceResourceManagerTy(Device),
: GenericDeviceResourceManagerTy(Device), Device(Device),
OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
NextQueue(0), Agent(HSAAgent) {}

Expand All @@ -1603,7 +1607,7 @@ struct AMDGPUStreamManagerTy final
QueueSize = HSAQueueSize;
MaxNumQueues = NumHSAQueues;
// Initialize one queue eagerly
if (auto Err = Queues.front().init(Agent, QueueSize))
if (auto Err = Queues.front().init(Device, Agent, QueueSize))
return Err;

return GenericDeviceResourceManagerTy::init(InitialSize);
Expand Down Expand Up @@ -1660,14 +1664,17 @@ struct AMDGPUStreamManagerTy final
}

// Make sure the queue is initialized, then add user & assign.
if (auto Err = Queues[Index].init(Agent, QueueSize))
if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
return Err;
Queues[Index].addUser();
Stream->Queue = &Queues[Index];

return Plugin::success();
}

/// The device associated with this stream.
GenericDeviceTy &Device;

/// Envar for controlling the tracking of busy HSA queues.
BoolEnvar OMPX_QueueTracking;

Expand Down Expand Up @@ -3074,7 +3081,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Initialized = true;

// Register event handler to detect memory errors on the devices.
Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
Status = hsa_amd_register_system_event_handler(eventHandler, this);
if (auto Err = Plugin::check(
Status, "Error in hsa_amd_register_system_event_handler: %s"))
return std::move(Err);
Expand Down Expand Up @@ -3209,7 +3216,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {

private:
/// Event handler that will be called by ROCr if an event is detected.
static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
void *PluginPtr) {
if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
return HSA_STATUS_SUCCESS;

Expand Down Expand Up @@ -3240,6 +3248,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
uint32_t Node = -1;
hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);

AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
for (uint32_t I = 0, E = Plugin.getNumDevices();
Node != uint32_t(-1) && I < E; ++I) {
AMDGPUDeviceTy &AMDGPUDevice =
reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
auto KernelTraceInfoRecord =
AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();

uint32_t DeviceNode = -1;
if (auto Err =
AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
consumeError(std::move(Err));
continue;
}
if (DeviceNode != Node)
continue;

ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
}

// Abort the execution since we do not recover from this error.
FATAL_MESSAGE(1,
"Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
Expand Down Expand Up @@ -3480,6 +3508,28 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
return Alloc;
}

/// Callback invoked by the HSA runtime when an error is detected on a queue.
///
/// \p Data is the AMDGPUDeviceTy the queue belongs to, registered as the user
/// data argument of hsa_queue_create (see AMDGPUQueueTy::init).
void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
                                  void *Data) {
  // static_cast is the idiomatic (and sufficient) cast from void* back to the
  // object pointer type that was stored.
  auto &AMDGPUDevice = *static_cast<AMDGPUDeviceTy *>(Data);

  // If a kernel hit a hardware trap instruction, report the recorded kernel
  // launch traces for the stream owning the failing queue before aborting.
  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
    auto KernelTraceInfoRecord =
        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
    // Match only launches whose underlying HSA queue is the one that
    // reported the error. Capture just \p Source; nothing else is needed.
    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
        [Source](__tgt_async_info &AsyncInfo) {
          auto *Stream = static_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
          if (!Stream || !Stream->getQueue())
            return false;
          return Stream->getQueue()->Queue == Source;
        };
    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
                                      AsyncInfoWrapperMatcher);
  }

  // Queue errors are not recoverable: emit the HSA error string and abort.
  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}

} // namespace plugin
} // namespace target
} // namespace omp
Expand Down
85 changes: 85 additions & 0 deletions offload/plugins-nextgen/common/include/ErrorReporting.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "PluginInterface.h"
#include "Shared/EnvironmentVar.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
Expand Down Expand Up @@ -216,6 +217,90 @@ class ErrorReporter {
getAllocTyName(ATI->Kind).data(), DevicePtr);
#undef DEALLOCATION_ERROR
}

/// Report that a kernel encountered a trap instruction.
static void reportTrapInKernel(
GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher) {
assert(AsyncInfoWrapperMatcher && "A matcher is required");

uint32_t Idx = 0;
for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
auto KTI = KTIR.getKernelTraceInfo(I);
Comment on lines +228 to +229
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does llvm::enumerate work here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likely, is that better? It has no precedent in Offload rn, maybe worth introducing in one swoop, if it is preferred.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a huge deal, just tends to be cleaner.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can change it here for the final version, assuming I won't encounter any problems.

if (KTI.Kernel == nullptr)
break;
// Skip kernels issued in other queues.
if (KTI.AsyncInfo && !(AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
continue;
Idx = I;
break;
}

auto KTI = KTIR.getKernelTraceInfo(Idx);
if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
reportError("Kernel '%s'", KTI.Kernel->getName());
reportError("execution interrupted by hardware trap instruction");
if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo))) {
if (!KTI.LaunchTrace.empty())
reportStackTrace(KTI.LaunchTrace);
else
print(Yellow, "Use '%s=1' to show the stack trace of the kernel\n",
Device.OMPX_TrackNumKernelLaunches.getName().data());
}
abort();
}

/// Report the kernel traces taken from \p KTIR, up to
/// OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES many.
static void reportKernelTraces(GenericDeviceTy &Device,
KernelTraceInfoRecordTy &KTIR) {
uint32_t NumKTIs = 0;
for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
auto KTI = KTIR.getKernelTraceInfo(I);
if (KTI.Kernel == nullptr)
break;
++NumKTIs;
}
if (NumKTIs == 0) {
print(BoldRed, "No kernel launches known\n");
return;
}

uint32_t TracesToShow =
std::min(Device.OMPX_TrackNumKernelLaunches.get(), NumKTIs);
if (TracesToShow == 0) {
if (NumKTIs == 1)
print(BoldLightPurple, "Display only launched kernel:\n");
else
print(BoldLightPurple, "Display last %u kernels launched:\n", NumKTIs);
} else {
if (NumKTIs == 1)
print(BoldLightPurple, "Display kernel launch trace:\n");
else
print(BoldLightPurple,
"Display %u of the %u last kernel launch traces:\n", TracesToShow,
NumKTIs);
}

for (uint32_t Idx = 0, I = 0; I < NumKTIs; ++Idx) {
auto KTI = KTIR.getKernelTraceInfo(Idx);
if (NumKTIs == 1)
print(BoldLightPurple, "Kernel '%s'\n", KTI.Kernel->getName());
else
print(BoldLightPurple, "Kernel %d: '%s'\n", I, KTI.Kernel->getName());
reportStackTrace(KTI.LaunchTrace);
++I;
}

if (NumKTIs != 1) {
print(Yellow,
"Use '%s=<num>' to adjust the number of shown stack traces (%u "
"now, up to %zu)\n",
Device.OMPX_TrackNumKernelLaunches.getName().data(),
Device.OMPX_TrackNumKernelLaunches.get(), KTIR.size());
}
// TODO: Let users know how to serialize kernels
}
};

} // namespace plugin
Expand Down
46 changes: 46 additions & 0 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,44 @@ struct AllocationTraceInfoTy {
std::mutex Lock;
};

/// Information about a single kernel launch: which kernel was launched, the
/// stack trace taken at the launch site, and the async info ("stream") it was
/// launched in, for error reporting purposes.
struct KernelTraceInfoTy {

  /// The launched kernel.
  GenericKernelTy *Kernel;

  /// The stack trace of the launch itself (empty if tracking was disabled).
  std::string LaunchTrace;

  /// The async info the kernel was launched in.
  __tgt_async_info *AsyncInfo;
};

struct KernelTraceInfoRecordTy {
KernelTraceInfoRecordTy() { KTIs.fill({}); }

/// Return the (maximal) record size.
auto size() const { return KTIs.size(); }

/// Create a new kernel trace info and add it into the record.
void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
__tgt_async_info *AsyncInfo) {
KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
Idx = (Idx + 1) % size();
}

/// Return the \p I'th last kernel trace info.
auto getKernelTraceInfo(int32_t I) const {
// Note that kernel trace infos "grow forward", so lookup is backwards.
return KTIs[(Idx - I - 1 + size()) % size()];
}

private:
std::array<KernelTraceInfoTy, 8> KTIs;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we using a static array here? We could use SmallVector<x, 8> and get the same stack layout, or is there a need for no dynamic resizing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now, I use it as a fixed size ring buffer. It is unclear to me why one would store more than the last few traces, at least for now.

unsigned Idx = 0;
};

/// Class representing a map of host pinned allocations. We track these pinned
/// allocations, so memory transfers involving these buffers can be optimized.
class PinnedAllocationMapTy {
Expand Down Expand Up @@ -900,6 +938,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// been deallocated, both for error reporting purposes.
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;

/// Map to record kernels that have been launched, for error reporting purposes.
ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;

/// Environment variable to determine if stack traces for kernel launches are
/// tracked.
UInt32Envar OMPX_TrackNumKernelLaunches =
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);

private:
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
Expand Down
12 changes: 12 additions & 0 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1468,6 +1468,18 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
GenericKernelTy &GenericKernel =
*reinterpret_cast<GenericKernelTy *>(EntryPtr);

{
std::string StackTrace;
if (OMPX_TrackNumKernelLaunches) {
llvm::raw_string_ostream OS(StackTrace);
llvm::sys::PrintStackTrace(OS);
}

auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
(*KernelTraceInfoRecord)
.emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
}

auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
AsyncInfoWrapper);

Expand Down
Loading
Loading