llvm
diff --git a/‎offload/include/Shared/EnvironmentVar.h
Lines changed: 5 additions & 1 deletion b/‎offload/include/Shared/EnvironmentVar.h
Lines changed: 5 additions & 1 deletion
diff --git a/‎offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
Lines changed: 1 addition & 0 deletions b/‎offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎offload/plugins-nextgen/amdgpu/src/rtl.cpp
Lines changed: 66 additions & 11 deletions b/‎offload/plugins-nextgen/amdgpu/src/rtl.cpp
Lines changed: 66 additions & 11 deletions
diff --git a/‎offload/plugins-nextgen/common/include/ErrorReporting.h
Lines changed: 102 additions & 9 deletions b/‎offload/plugins-nextgen/common/include/ErrorReporting.h
Lines changed: 102 additions & 9 deletions
@@ -28,6 +28,7 @@ struct StringParser {
 /// Class for reading and checking environment variables. Currently working with
 /// integer, floats, std::string and bool types.
 template <typename Ty> class Envar {
+  llvm::StringRef Name;
   Ty Data;
   bool IsPresent;
   bool Initialized;
@@ -53,7 +54,7 @@ template <typename Ty> class Envar {
   /// take the value read from the environment variable, or the default if it
   /// was not set or not correct. This constructor is not fallible.
   Envar(llvm::StringRef Name, Ty Default = Ty())
-      : Data(Default), IsPresent(false), Initialized(true) {
+      : Name(Name), Data(Default), IsPresent(false), Initialized(true) {
 
     if (const char *EnvStr = getenv(Name.data())) {
       // Check whether the envar is defined and valid.
@@ -84,6 +85,9 @@ template <typename Ty> class Envar {
   /// Get the definitive value.
   operator Ty() const { return get(); }
 
+  /// Return the environment variable name.
+  llvm::StringRef getName() const { return Name; }
+
   /// Indicate whether the environment variable was defined and valid.
   bool isPresent() const { return IsPresent; }
 
 
@@ -31,6 +31,7 @@ typedef enum {
   HSA_STATUS_ERROR = 0x1000,
   HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
   HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
+  HSA_STATUS_ERROR_EXCEPTION = 0x1016,
 } hsa_status_t;
 
 hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);
 
@@ -13,13 +13,16 @@
 #include <atomic>
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <deque>
+#include <functional>
 #include <mutex>
 #include <string>
 #include <system_error>
 #include <unistd.h>
 #include <unordered_map>
 
+#include "ErrorReporting.h"
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
@@ -43,6 +46,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
 
 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
@@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
   AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
 
   /// Lazily initialize a new queue belonging to a specific agent.
-  Error init(hsa_agent_t Agent, int32_t QueueSize) {
+  Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
     if (Queue)
       return Plugin::success();
     hsa_status_t Status =
         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
-                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
+                         &Device, UINT32_MAX, UINT32_MAX, &Queue);
     return Plugin::check(Status, "Error in hsa_queue_create: %s");
   }
 
@@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
   }
 
   /// Callack that will be called when an error is detected on the HSA queue.
-  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
-    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
-    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
-  }
+  static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                            void *Data);
 
   /// The HSA queue.
   hsa_queue_t *Queue;
@@ -1214,6 +1216,9 @@ struct AMDGPUStreamTy {
   /// Deinitialize the stream's signals.
   Error deinit() { return Plugin::success(); }
 
+  /// Return the associated (device) agent.
+  hsa_agent_t getAgent() const { return Agent; }
+
   /// Attach an RPC server to this stream.
   void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
 
@@ -1484,6 +1489,8 @@ struct AMDGPUStreamTy {
     return true;
   }
 
+  const AMDGPUQueueTy *getQueue() const { return Queue; }
+
   /// Record the state of the stream on an event.
   Error recordEvent(AMDGPUEventTy &Event) const;
 
@@ -1594,7 +1601,7 @@ struct AMDGPUStreamManagerTy final
   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
 
   AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
-      : GenericDeviceResourceManagerTy(Device),
+      : GenericDeviceResourceManagerTy(Device), Device(Device),
         OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
         NextQueue(0), Agent(HSAAgent) {}
 
@@ -1603,7 +1610,7 @@ struct AMDGPUStreamManagerTy final
     QueueSize = HSAQueueSize;
     MaxNumQueues = NumHSAQueues;
     // Initialize one queue eagerly
-    if (auto Err = Queues.front().init(Agent, QueueSize))
+    if (auto Err = Queues.front().init(Device, Agent, QueueSize))
       return Err;
 
     return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,14 +1667,17 @@ struct AMDGPUStreamManagerTy final
     }
 
     // Make sure the queue is initialized, then add user & assign.
-    if (auto Err = Queues[Index].init(Agent, QueueSize))
+    if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
       return Err;
     Queues[Index].addUser();
     Stream->Queue = &Queues[Index];
 
     return Plugin::success();
   }
 
+  /// The device associated with this stream.
+  GenericDeviceTy &Device;
+
   /// Envar for controlling the tracking of busy HSA queues.
   BoolEnvar OMPX_QueueTracking;
 
@@ -3074,7 +3084,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     Initialized = true;
 
     // Register event handler to detect memory errors on the devices.
-    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
+    Status = hsa_amd_register_system_event_handler(eventHandler, this);
     if (auto Err = Plugin::check(
             Status, "Error in hsa_amd_register_system_event_handler: %s"))
       return std::move(Err);
@@ -3209,7 +3219,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
 
 private:
   /// Event handler that will be called by ROCr if an event is detected.
-  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
+  static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
+                                   void *PluginPtr) {
     if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
       return HSA_STATUS_SUCCESS;
 
@@ -3240,6 +3251,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     uint32_t Node = -1;
     hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
 
+    AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
+    for (uint32_t I = 0, E = Plugin.getNumDevices();
+         Node != uint32_t(-1) && I < E; ++I) {
+      AMDGPUDeviceTy &AMDGPUDevice =
+          reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
+      auto KernelTraceInfoRecord =
+          AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+
+      uint32_t DeviceNode = -1;
+      if (auto Err =
+              AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
+        consumeError(std::move(Err));
+        continue;
+      }
+      if (DeviceNode != Node)
+        continue;
+
+      ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
+    }
+
     // Abort the execution since we do not recover from this error.
     FATAL_MESSAGE(1,
                   "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
@@ -3480,6 +3511,30 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   return Alloc;
 }
 
+void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                                  void *Data) {
+  auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
+
+  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
+    auto KernelTraceInfoRecord =
+        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
+        [=](__tgt_async_info &AsyncInfo) {
+          auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
+          if (!Stream)
+            return false;
+          if (!Stream->getQueue())
+            return false;
+          return Stream->getQueue()->Queue == Source;
+        };
+    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
+                                      AsyncInfoWrapperMatcher);
+  }
+
+  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
+  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
+}
+
 } // namespace plugin
 } // namespace target
 } // namespace omp
 
@@ -12,13 +12,17 @@
 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
 
 #include "PluginInterface.h"
+#include "Shared/EnvironmentVar.h"
 
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ErrorHandling.h"
 
+#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#include <functional>
+#include <optional>
 #include <string>
 
 namespace llvm {
@@ -84,11 +88,24 @@ class ErrorReporter {
 
   /// Print \p Format, instantiated with \p Args to stderr.
   /// TODO: Allow redirection into a file stream.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wgcc-compat"
+#pragma clang diagnostic ignored "-Wformat-security"
   template <typename... ArgsTy>
-  static void print(const char *Format, ArgsTy &&...Args) {
+  [[gnu::format(__printf__, 1, 2)]] static void print(const char *Format,
+                                                      ArgsTy &&...Args) {
     fprintf(stderr, Format, std::forward<ArgsTy>(Args)...);
   }
 
+  /// Report an error.
+  template <typename... ArgsTy>
+  [[gnu::format(__printf__, 1, 2)]] static void reportError(const char *Format,
+                                                            ArgsTy &&...Args) {
+    print(getCString("%s%s%s\n%s", Red(), ErrorBanner, Format, Default()),
+          Args...);
+  }
+#pragma clang diagnostic pop
+
   /// Pretty print a stack trace.
   static void reportStackTrace(StringRef StackTrace) {
     if (StackTrace.empty())
@@ -100,6 +117,7 @@ class ErrorReporter {
     for (int I = Start, E = Lines.size(); I < E; ++I) {
       auto Line = Lines[I];
       Parts.clear();
+      Line = Line.drop_while([](char C) { return std::isspace(C); });
       Line.split(Parts, " ", /*MaxSplit=*/2);
       if (Parts.size() != 3 || Parts[0].size() < 2 || Parts[0][0] != '#') {
         print("%s\n", Line.str().c_str());
@@ -116,19 +134,13 @@ class ErrorReporter {
     printf("\n");
   }
 
-  /// Report an error.
-  static void reportError(const char *Message, StringRef StackTrace) {
-    print("%s%s%s\n%s", Red(), ErrorBanner, Message, Default());
-    reportStackTrace(StackTrace);
-  }
-
   /// Report information about an allocation associated with \p ATI.
   static void reportAllocationInfo(AllocationTraceInfoTy *ATI) {
     if (!ATI)
       return;
 
     if (!ATI->DeallocationTrace.empty()) {
-      print("%s%s%s\n%s", Cyan(), "Last deallocation:", Default());
+      print("%s%s\n%s", Cyan(), "Last deallocation:", Default());
       reportStackTrace(ATI->DeallocationTrace);
     }
 
@@ -166,7 +178,8 @@ class ErrorReporter {
                                 TargetAllocTy Kind, AllocationTraceInfoTy *ATI,
                                 std::string &StackTrace) {
 #define DEALLOCATION_ERROR(Format, ...)                                        \
-  reportError(getCString(Format, __VA_ARGS__), StackTrace);                    \
+  reportError(Format, __VA_ARGS__);                                            \
+  reportStackTrace(StackTrace);                                                \
   reportAllocationInfo(ATI);                                                   \
   abort();
 
@@ -190,6 +203,86 @@ class ErrorReporter {
 
 #undef DEALLOCATION_ERROR
   }
+
+  /// Report that a kernel encountered a trap instruction.
+  static void reportTrapInKernel(
+      GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
+      std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher) {
+    assert(AsyncInfoWrapperMatcher && "A matcher is required");
+
+    uint32_t Idx = 0;
+    for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
+      auto KTI = KTIR.getKernelTraceInfo(I);
+      if (KTI.Kernel == nullptr)
+        break;
+      // Skip kernels issued in other queues.
+      if (KTI.AsyncInfo && !(AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
+        continue;
+      Idx = I;
+      break;
+    }
+
+    auto KTI = KTIR.getKernelTraceInfo(Idx);
+    if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
+      reportError("Kernel '%s'", KTI.Kernel->getName());
+    reportError("execution interrupted by hardware trap instruction");
+    if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
+      reportStackTrace(KTI.LaunchTrace);
+    abort();
+  }
+
+  /// Report the kernel traces taken from \p KTIR, up to
+  /// OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES many.
+  static void reportKernelTraces(GenericDeviceTy &Device,
+                                 KernelTraceInfoRecordTy &KTIR) {
+    uint32_t NumKTIs = 0;
+    for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
+      auto KTI = KTIR.getKernelTraceInfo(I);
+      if (KTI.Kernel == nullptr)
+        break;
+      ++NumKTIs;
+    }
+    if (NumKTIs == 0) {
+      print("%sNo kernel launches known\n%s", Red(), Default());
+      return;
+    }
+
+    uint32_t TracesToShow =
+        std::min(Device.OMPX_TrackNumKernelLaunches.get(), NumKTIs);
+    if (TracesToShow == 0) {
+      if (NumKTIs == 1) {
+        print("%sDisplay only launched kernel:\n%s", Cyan(), Default());
+      } else {
+        print("%sDisplay last %u kernels launched:\n%s", Cyan(), NumKTIs,
+              Default());
+      }
+    } else {
+      if (NumKTIs == 1) {
+        print("%sDisplay kernel launch trace:\n%s", Cyan(), Default());
+      } else {
+        print("%sDisplay %u of the %u last kernel launch traces:\n%s", Cyan(),
+              TracesToShow, NumKTIs, Default());
+      }
+    }
+
+    for (uint32_t Idx = 0, I = 0; I < NumKTIs; ++Idx) {
+      auto KTI = KTIR.getKernelTraceInfo(Idx);
+      if (NumKTIs == 1) {
+        print("%sKernel '%s'\n%s", Magenta(), KTI.Kernel->getName(), Default());
+      } else {
+        print("%sKernel %d: '%s'\n%s", Magenta(), I, KTI.Kernel->getName(),
+              Default());
+      }
+      reportStackTrace(KTI.LaunchTrace);
+      ++I;
+    }
+
+    if (NumKTIs != 1) {
+      print("Use '%s=<num>' to adjust the number of shown traces (up to %zu)\n",
+            Device.OMPX_TrackNumKernelLaunches.getName().data(), KTIR.size());
+    }
+    // TODO: Let users know how to serialize kernels
+  }
 };
 
 } // namespace plugin