@@ -13,13 +13,16 @@
 #include <atomic>
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <deque>
+#include <functional>
 #include <mutex>
 #include <string>
 #include <system_error>
 #include <unistd.h>
 #include <unordered_map>

+#include "ErrorReporting.h"
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
@@ -43,6 +46,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"

 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
@@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
   AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}

   /// Lazily initialize a new queue belonging to a specific agent.
-  Error init(hsa_agent_t Agent, int32_t QueueSize) {
+  Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
     if (Queue)
       return Plugin::success();
     hsa_status_t Status =
         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
-                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
+                         &Device, UINT32_MAX, UINT32_MAX, &Queue);
     return Plugin::check(Status, "Error in hsa_queue_create: %s");
   }

@@ -875,10 +879,8 @@ struct AMDGPUQueueTy {
   }

   /// Callack that will be called when an error is detected on the HSA queue.
-  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
-    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
-    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
-  }
+  static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                            void *Data);

   /// The HSA queue.
   hsa_queue_t *Queue;
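The hunk above drops the inline fatal-error body and gives the callback a `void *Data` parameter; together with the `init` change earlier, the owning device is now threaded through the user-data argument of `hsa_queue_create`. A minimal standalone sketch of that HSA pattern, separate from the plugin code (the `QueueContext` type and all names below are hypothetical, and the hsa.h include path may differ between ROCm installs):

```cpp
// Sketch only: recover per-queue context in an HSA queue error callback via
// the `data` pointer that hsa_queue_create forwards to it.
#include <hsa/hsa.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct QueueContext {
  const char *Name; // whatever the error path will need later
};

static void queueErrorCB(hsa_status_t Status, hsa_queue_t *Source,
                         void *Data) {
  auto *Ctx = static_cast<QueueContext *>(Data);
  const char *Msg = "unknown error";
  hsa_status_string(Status, &Msg);
  std::fprintf(stderr, "queue %p (%s) failed: %s\n", (void *)Source,
               Ctx ? Ctx->Name : "unknown", Msg);
  std::abort(); // the plugin likewise treats queue errors as fatal
}

hsa_status_t createQueue(hsa_agent_t Agent, uint32_t Size, QueueContext *Ctx,
                         hsa_queue_t **Queue) {
  // The argument after the callback is forwarded verbatim as `Data`.
  return hsa_queue_create(Agent, Size, HSA_QUEUE_TYPE_MULTI, queueErrorCB, Ctx,
                          UINT32_MAX, UINT32_MAX, Queue);
}
```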
@@ -1484,6 +1486,8 @@ struct AMDGPUStreamTy {
     return true;
   }

+  const AMDGPUQueueTy *getQueue() const { return Queue; }
+
   /// Record the state of the stream on an event.
   Error recordEvent(AMDGPUEventTy &Event) const;

@@ -1594,7 +1598,7 @@ struct AMDGPUStreamManagerTy final
   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;

   AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
-      : GenericDeviceResourceManagerTy(Device),
+      : GenericDeviceResourceManagerTy(Device), Device(Device),
         OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
         NextQueue(0), Agent(HSAAgent) {}

@@ -1603,7 +1607,7 @@ struct AMDGPUStreamManagerTy final
     QueueSize = HSAQueueSize;
     MaxNumQueues = NumHSAQueues;
     // Initialize one queue eagerly
-    if (auto Err = Queues.front().init(Agent, QueueSize))
+    if (auto Err = Queues.front().init(Device, Agent, QueueSize))
       return Err;

     return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,14 +1664,17 @@ struct AMDGPUStreamManagerTy final
     }

     // Make sure the queue is initialized, then add user & assign.
-    if (auto Err = Queues[Index].init(Agent, QueueSize))
+    if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
       return Err;
     Queues[Index].addUser();
     Stream->Queue = &Queues[Index];

     return Plugin::success();
   }

+  /// The device associated with this stream.
+  GenericDeviceTy &Device;
+
   /// Envar for controlling the tracking of busy HSA queues.
   BoolEnvar OMPX_QueueTracking;

@@ -3074,7 +3081,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     Initialized = true;

     // Register event handler to detect memory errors on the devices.
-    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
+    Status = hsa_amd_register_system_event_handler(eventHandler, this);
     if (auto Err = Plugin::check(
             Status, "Error in hsa_amd_register_system_event_handler: %s"))
       return std::move(Err);
@@ -3209,7 +3216,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {

 private:
   /// Event handler that will be called by ROCr if an event is detected.
-  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
+  static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
+                                   void *PluginPtr) {
     if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
       return HSA_STATUS_SUCCESS;

@@ -3240,6 +3248,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     uint32_t Node = -1;
     hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);

+    AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
+    for (uint32_t I = 0, E = Plugin.getNumDevices();
+         Node != uint32_t(-1) && I < E; ++I) {
+      AMDGPUDeviceTy &AMDGPUDevice =
+          reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
+      auto KernelTraceInfoRecord =
+          AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+
+      uint32_t DeviceNode = -1;
+      if (auto Err =
+              AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
+        consumeError(std::move(Err));
+        continue;
+      }
+      if (DeviceNode != Node)
+        continue;
+
+      ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
+    }
+
     // Abort the execution since we do not recover from this error.
     FATAL_MESSAGE(1,
                   "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
@@ -3480,6 +3508,30 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   return Alloc;
 }

+void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
+                                  void *Data) {
+  auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
+
+  if (Status == HSA_STATUS_ERROR_EXCEPTION) {
+    auto KernelTraceInfoRecord =
+        AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+    std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
+        [=](__tgt_async_info &AsyncInfo) {
+          auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
+          if (!Stream)
+            return false;
+          if (!Stream->getQueue())
+            return false;
+          return Stream->getQueue()->Queue == Source;
+        };
+    ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
+                                      AsyncInfoWrapperMatcher);
+  }
+
+  auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
+  FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
+}
+
 } // namespace plugin
 } // namespace target
 } // namespace omp