@@ -13,13 +13,16 @@
 #include <atomic>
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <deque>
+#include <functional>
 #include <mutex>
 #include <string>
 #include <system_error>
 #include <unistd.h>
 #include <unordered_map>
 
+#include "ErrorReporting.h"
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
@@ -43,6 +46,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
 
 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
@@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
   AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
 
   /// Lazily initialize a new queue belonging to a specific agent.
-  Error init(hsa_agent_t Agent, int32_t QueueSize) {
+  Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
     if (Queue)
       return Plugin::success();
     hsa_status_t Status =
         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
-                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
+                         &Device, UINT32_MAX, UINT32_MAX, &Queue);
     return Plugin::check(Status, "Error in hsa_queue_create: %s");
   }
 
@@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
   }
 
   /// Callback that will be called when an error is detected on the HSA queue.
-  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
-    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
-    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
-  }
+  static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                            void *Data);
 
   /// The HSA queue.
   hsa_queue_t *Queue;
@@ -1214,6 +1216,9 @@ struct AMDGPUStreamTy {
   /// Deinitialize the stream's signals.
   Error deinit() { return Plugin::success(); }
 
+  /// Return the associated (device) agent.
+  hsa_agent_t getAgent() const { return Agent; }
+
   /// Attach an RPC server to this stream.
   void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
 
@@ -1484,6 +1489,8 @@ struct AMDGPUStreamTy {
     return true;
   }
 
+  const AMDGPUQueueTy *getQueue() const { return Queue; }
+
   /// Record the state of the stream on an event.
   Error recordEvent(AMDGPUEventTy &Event) const;
 
@@ -1594,7 +1601,7 @@ struct AMDGPUStreamManagerTy final
   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
 
   AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
-      : GenericDeviceResourceManagerTy(Device),
+      : GenericDeviceResourceManagerTy(Device), Device(Device),
        OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
        NextQueue(0), Agent(HSAAgent) {}
 
@@ -1603,7 +1610,7 @@ struct AMDGPUStreamManagerTy final
     QueueSize = HSAQueueSize;
     MaxNumQueues = NumHSAQueues;
     // Initialize one queue eagerly
-    if (auto Err = Queues.front().init(Agent, QueueSize))
+    if (auto Err = Queues.front().init(Device, Agent, QueueSize))
       return Err;
 
     return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,14 +1667,17 @@ struct AMDGPUStreamManagerTy final
     }
 
     // Make sure the queue is initialized, then add user & assign.
-    if (auto Err = Queues[Index].init(Agent, QueueSize))
+    if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
       return Err;
     Queues[Index].addUser();
     Stream->Queue = &Queues[Index];
 
     return Plugin::success();
   }
 
+  /// The device associated with this stream.
+  GenericDeviceTy &Device;
+
   /// Envar for controlling the tracking of busy HSA queues.
   BoolEnvar OMPX_QueueTracking;
 
@@ -3074,7 +3084,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     Initialized = true;
 
     // Register event handler to detect memory errors on the devices.
-    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
+    Status = hsa_amd_register_system_event_handler(eventHandler, this);
     if (auto Err = Plugin::check(
             Status, "Error in hsa_amd_register_system_event_handler: %s"))
       return std::move(Err);
@@ -3209,7 +3219,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
 
 private:
   /// Event handler that will be called by ROCr if an event is detected.
-  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
+  static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
+                                   void *PluginPtr) {
     if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
       return HSA_STATUS_SUCCESS;
 
@@ -3240,6 +3251,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     uint32_t Node = -1;
     hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
 
+    AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
+    for (uint32_t I = 0, E = Plugin.getNumDevices();
+         Node != uint32_t(-1) && I < E; ++I) {
+      AMDGPUDeviceTy &AMDGPUDevice =
+          reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
+      auto KernelTraceInfoRecord =
+          AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+
+      uint32_t DeviceNode = -1;
+      if (auto Err =
+              AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
+        consumeError(std::move(Err));
+        continue;
+      }
+      if (DeviceNode != Node)
+        continue;
+
+      ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
+    }
+
     // Abort the execution since we do not recover from this error.
     FATAL_MESSAGE(1,
                   "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
@@ -3480,6 +3511,30 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   return Alloc;
 }
 
+void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                                  void *Data) {
+  auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
+
+  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
+    auto KernelTraceInfoRecord =
+        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
+        [=](__tgt_async_info &AsyncInfo) {
+          auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
+          if (!Stream)
+            return false;
+          if (!Stream->getQueue())
+            return false;
+          return Stream->getQueue()->Queue == Source;
+        };
+    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
+                                      AsyncInfoWrapperMatcher);
+  }
+
+  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
+  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
+}
+
 } // namespace plugin
 } // namespace target
 } // namespace omp