@@ -583,10 +583,12 @@ using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
583
583
// / Class holding an HSA queue to submit kernel and barrier packets.
584
584
struct AMDGPUQueueTy {
585
585
// / Create an empty queue.
586
- AMDGPUQueueTy () : Queue(nullptr ), Mutex() {}
586
+ AMDGPUQueueTy () : Queue(nullptr ), Mutex(), NumUsers( 0 ) {}
587
587
588
- // / Initialize a new queue belonging to a specific agent.
588
+ // / Lazily initialize a new queue belonging to a specific agent.
589
589
Error init (hsa_agent_t Agent, int32_t QueueSize) {
590
+ if (Queue)
591
+ return Plugin::success ();
590
592
hsa_status_t Status =
591
593
hsa_queue_create (Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
592
594
nullptr , UINT32_MAX, UINT32_MAX, &Queue);
@@ -595,10 +597,22 @@ struct AMDGPUQueueTy {
595
597
596
598
// / Deinitialize the queue and destroy its resources.
597
599
Error deinit () {
600
+ std::lock_guard<std::mutex> Lock (Mutex);
601
+ if (!Queue)
602
+ return Plugin::success ();
598
603
hsa_status_t Status = hsa_queue_destroy (Queue);
599
604
return Plugin::check (Status, " Error in hsa_queue_destroy: %s" );
600
605
}
601
606
607
+ // / Returns if this queue is considered busy
608
+ bool isBusy () const { return NumUsers > 0 ; }
609
+
610
+ // / Decrement user count of the queue object
611
+ void removeUser () { --NumUsers; }
612
+
613
+ // / Increase user count of the queue object
614
+ void addUser () { ++NumUsers; }
615
+
602
616
// / Push a kernel launch to the queue. The kernel launch requires an output
603
617
// / signal and can define an optional input signal (nullptr if none).
604
618
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
@@ -611,6 +625,7 @@ struct AMDGPUQueueTy {
611
625
// the addition of other packets to the queue. The following piece of code
612
626
// should be lightweight; do not block the thread, allocate memory, etc.
613
627
std::lock_guard<std::mutex> Lock (Mutex);
628
+ assert (Queue && " Interacted with a non-initialized queue!" );
614
629
615
630
// Avoid defining the input dependency if already satisfied.
616
631
if (InputSignal && !InputSignal->load ())
@@ -659,6 +674,7 @@ struct AMDGPUQueueTy {
659
674
const AMDGPUSignalTy *InputSignal2) {
660
675
// Lock the queue during the packet publishing process.
661
676
std::lock_guard<std::mutex> Lock (Mutex);
677
+ assert (Queue && " Interacted with a non-initialized queue!" );
662
678
663
679
// Push the barrier with the lock acquired.
664
680
return pushBarrierImpl (OutputSignal, InputSignal1, InputSignal2);
@@ -777,6 +793,9 @@ struct AMDGPUQueueTy {
777
793
// / TODO: There are other more advanced approaches to avoid this mutex using
778
794
// / atomic operations. We can further investigate it if this is a bottleneck.
779
795
std::mutex Mutex;
796
+
797
+ // / Indicates that the queue is busy when > 0
798
+ int NumUsers;
780
799
};
781
800
782
801
// / Struct that implements a stream of asynchronous operations for AMDGPU
@@ -886,7 +905,7 @@ struct AMDGPUStreamTy {
886
905
hsa_agent_t Agent;
887
906
888
907
// / The queue that the stream uses to launch kernels.
889
- AMDGPUQueueTy & Queue;
908
+ AMDGPUQueueTy * Queue;
890
909
891
910
// / The manager of signals to reuse signals.
892
911
AMDGPUSignalManagerTy &SignalManager;
@@ -978,6 +997,9 @@ struct AMDGPUStreamTy {
978
997
// / signal of the current stream, and 2) the last signal of the other stream.
979
998
// / Use a barrier packet with two input signals.
980
999
Error waitOnStreamOperation (AMDGPUStreamTy &OtherStream, uint32_t Slot) {
1000
+ if (Queue == nullptr )
1001
+ return Plugin::error (" Target queue was nullptr" );
1002
+
981
1003
// / The signal that we must wait from the other stream.
982
1004
AMDGPUSignalTy *OtherSignal = OtherStream.Slots [Slot].Signal ;
983
1005
@@ -999,7 +1021,7 @@ struct AMDGPUStreamTy {
999
1021
return Err;
1000
1022
1001
1023
// Push a barrier into the queue with both input signals.
1002
- return Queue. pushBarrier (OutputSignal, InputSignal, OtherSignal);
1024
+ return Queue-> pushBarrier (OutputSignal, InputSignal, OtherSignal);
1003
1025
}
1004
1026
1005
1027
// / Callback for running a specific asynchronous operation. This callback is
@@ -1085,6 +1107,9 @@ struct AMDGPUStreamTy {
1085
1107
uint32_t NumThreads, uint64_t NumBlocks,
1086
1108
uint32_t GroupSize,
1087
1109
AMDGPUMemoryManagerTy &MemoryManager) {
1110
+ if (Queue == nullptr )
1111
+ return Plugin::error (" Target queue was nullptr" );
1112
+
1088
1113
// Retrieve an available signal for the operation's output.
1089
1114
AMDGPUSignalTy *OutputSignal = nullptr ;
1090
1115
if (auto Err = SignalManager.getResource (OutputSignal))
@@ -1102,8 +1127,8 @@ struct AMDGPUStreamTy {
1102
1127
return Err;
1103
1128
1104
1129
// Push the kernel with the output signal and an input signal (optional)
1105
- return Queue. pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
1106
- GroupSize, OutputSignal, InputSignal);
1130
+ return Queue-> pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
1131
+ GroupSize, OutputSignal, InputSignal);
1107
1132
}
1108
1133
1109
1134
// / Push an asynchronous memory copy between pinned memory buffers.
@@ -1331,6 +1356,8 @@ struct AMDGPUStreamTy {
1331
1356
1332
1357
// / Make the stream wait on an event.
1333
1358
Error waitEvent (const AMDGPUEventTy &Event);
1359
+
1360
+ friend struct AMDGPUStreamManagerTy ;
1334
1361
};
1335
1362
1336
1363
// / Class representing an event on AMDGPU. The event basically stores some
@@ -1428,6 +1455,99 @@ Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
1428
1455
return waitOnStreamOperation (RecordedStream, Event.RecordedSlot );
1429
1456
}
1430
1457
1458
+ struct AMDGPUStreamManagerTy final
1459
+ : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
1460
+ using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
1461
+ using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
1462
+
1463
+ AMDGPUStreamManagerTy (GenericDeviceTy &Device, hsa_agent_t HSAAgent)
1464
+ : GenericDeviceResourceManagerTy(Device), NextQueue(0 ), Agent(HSAAgent) {}
1465
+
1466
+ Error init (uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
1467
+ Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
1468
+ QueueSize = HSAQueueSize;
1469
+ MaxNumQueues = NumHSAQueues;
1470
+ // Initialize one queue eagerly
1471
+ if (auto Err = Queues.front ().init (Agent, QueueSize))
1472
+ return Err;
1473
+
1474
+ return GenericDeviceResourceManagerTy::init (InitialSize);
1475
+ }
1476
+
1477
+ // / Deinitialize the resource pool and delete all resources. This function
1478
+ // / must be called before the destructor.
1479
+ Error deinit () override {
1480
+ // De-init all queues
1481
+ for (AMDGPUQueueTy &Queue : Queues) {
1482
+ if (auto Err = Queue.deinit ())
1483
+ return Err;
1484
+ }
1485
+
1486
+ return GenericDeviceResourceManagerTy::deinit ();
1487
+ }
1488
+
1489
+ // / Get a single stream from the pool or create new resources.
1490
+ virtual Error getResource (AMDGPUStreamTy *&StreamHandle) override {
1491
+ return getResourcesImpl (1 , &StreamHandle, [this ](AMDGPUStreamTy *&Handle) {
1492
+ return assignNextQueue (Handle);
1493
+ });
1494
+ }
1495
+
1496
+ // / Return stream to the pool.
1497
+ virtual Error returnResource (AMDGPUStreamTy *StreamHandle) override {
1498
+ return returnResourceImpl (StreamHandle, [](AMDGPUStreamTy *Handle) {
1499
+ Handle->Queue ->removeUser ();
1500
+ return Plugin::success ();
1501
+ });
1502
+ }
1503
+
1504
+ private:
1505
+ // / Search for and assign an prefereably idle queue to the given Stream. If
1506
+ // / there is no queue without current users, resort to round robin selection.
1507
+ inline Error assignNextQueue (AMDGPUStreamTy *Stream) {
1508
+ uint32_t StartIndex = NextQueue % MaxNumQueues;
1509
+ AMDGPUQueueTy *Q = nullptr ;
1510
+
1511
+ for (int i = 0 ; i < MaxNumQueues; ++i) {
1512
+ Q = &Queues[StartIndex++];
1513
+ if (StartIndex == MaxNumQueues)
1514
+ StartIndex = 0 ;
1515
+
1516
+ if (Q->isBusy ())
1517
+ continue ;
1518
+ else {
1519
+ if (auto Err = Q->init (Agent, QueueSize))
1520
+ return Err;
1521
+
1522
+ Q->addUser ();
1523
+ Stream->Queue = Q;
1524
+ return Plugin::success ();
1525
+ }
1526
+ }
1527
+
1528
+ // All queues busy: Round robin (StartIndex has the initial value again)
1529
+ Queues[StartIndex].addUser ();
1530
+ Stream->Queue = &Queues[StartIndex];
1531
+ ++NextQueue;
1532
+ return Plugin::success ();
1533
+ }
1534
+
1535
+ // / The next queue index to use for round robin selection.
1536
+ uint32_t NextQueue;
1537
+
1538
+ // / The queues which are assigned to requested streams.
1539
+ std::vector<AMDGPUQueueTy> Queues;
1540
+
1541
+ // / The corresponding device as HSA agent.
1542
+ hsa_agent_t Agent;
1543
+
1544
+ // / The maximum number of queues.
1545
+ int MaxNumQueues;
1546
+
1547
+ // / The size of created queues.
1548
+ int QueueSize;
1549
+ };
1550
+
1431
1551
// / Abstract class that holds the common members of the actual kernel devices
1432
1552
// / and the host device. Both types should inherit from this class.
1433
1553
struct AMDGenericDeviceTy {
@@ -1607,9 +1727,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1607
1727
OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
1608
1728
64 ),
1609
1729
OMPX_StreamBusyWait (" LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000 ),
1610
- AMDGPUStreamManager (*this ), AMDGPUEventManager(*this ),
1611
- AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice),
1612
- Queues () {}
1730
+ AMDGPUStreamManager (*this , Agent), AMDGPUEventManager(*this ),
1731
+ AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice) {}
1613
1732
1614
1733
~AMDGPUDeviceTy () {}
1615
1734
@@ -1676,17 +1795,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1676
1795
return Err;
1677
1796
1678
1797
// Compute the number of queues and their size.
1679
- const uint32_t NumQueues = std::min (OMPX_NumQueues.get (), MaxQueues);
1680
- const uint32_t QueueSize = std::min (OMPX_QueueSize.get (), MaxQueueSize);
1681
-
1682
- // Construct and initialize each device queue.
1683
- Queues = std::vector<AMDGPUQueueTy>(NumQueues);
1684
- for (AMDGPUQueueTy &Queue : Queues)
1685
- if (auto Err = Queue.init (Agent, QueueSize))
1686
- return Err;
1798
+ OMPX_NumQueues = std::max (1U , std::min (OMPX_NumQueues.get (), MaxQueues));
1799
+ OMPX_QueueSize = std::min (OMPX_QueueSize.get (), MaxQueueSize);
1687
1800
1688
1801
// Initialize stream pool.
1689
- if (auto Err = AMDGPUStreamManager.init (OMPX_InitialNumStreams))
1802
+ if (auto Err = AMDGPUStreamManager.init (OMPX_InitialNumStreams,
1803
+ OMPX_NumQueues, OMPX_QueueSize))
1690
1804
return Err;
1691
1805
1692
1806
// Initialize event pool.
@@ -1725,11 +1839,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1725
1839
}
1726
1840
}
1727
1841
1728
- for (AMDGPUQueueTy &Queue : Queues) {
1729
- if (auto Err = Queue.deinit ())
1730
- return Err;
1731
- }
1732
-
1733
1842
// Invalidate agent reference.
1734
1843
Agent = {0 };
1735
1844
@@ -2416,19 +2525,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2416
2525
});
2417
2526
}
2418
2527
2419
- // / Get the next queue in a round-robin fashion.
2420
- AMDGPUQueueTy &getNextQueue () {
2421
- static std::atomic<uint32_t > NextQueue (0 );
2422
-
2423
- uint32_t Current = NextQueue.fetch_add (1 , std::memory_order_relaxed);
2424
- return Queues[Current % Queues.size ()];
2425
- }
2426
-
2427
2528
private:
2428
- using AMDGPUStreamRef = AMDGPUResourceRef<AMDGPUStreamTy>;
2429
2529
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2430
-
2431
- using AMDGPUStreamManagerTy = GenericDeviceResourceManagerTy<AMDGPUStreamRef>;
2432
2530
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
2433
2531
2434
2532
// / Envar for controlling the number of HSA queues per device. High number of
@@ -2484,9 +2582,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2484
2582
2485
2583
// / Reference to the host device.
2486
2584
AMDHostDeviceTy &HostDevice;
2487
-
2488
- // / List of device packet queues.
2489
- std::vector<AMDGPUQueueTy> Queues;
2490
2585
};
2491
2586
2492
2587
Error AMDGPUDeviceImageTy::loadExecutable (const AMDGPUDeviceTy &Device) {
@@ -2558,7 +2653,7 @@ Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
2558
2653
}
2559
2654
2560
2655
AMDGPUStreamTy::AMDGPUStreamTy (AMDGPUDeviceTy &Device)
2561
- : Agent(Device.getAgent()), Queue(Device.getNextQueue() ),
2656
+ : Agent(Device.getAgent()), Queue(nullptr ),
2562
2657
SignalManager (Device.getSignalManager()), Device(Device),
2563
2658
// Initialize the std::deque with some empty positions.
2564
2659
Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer(nullptr ),
0 commit comments