@@ -1403,6 +1403,9 @@ struct AMDGPUStreamTy {
   /// Mutex to protect stream's management.
   mutable std::mutex Mutex;
 
+  /// Use synchronous copy back.
+  bool UseSyncCopyBack;
+
   /// Timeout hint for HSA actively waiting for signal value to change.
   const uint64_t StreamBusyWaitMicroseconds;
 
@@ -1625,18 +1628,19 @@ struct AMDGPUStreamTy {
     // Consume stream slot and compute dependencies.
     auto [Curr, InputSignal] = consume(OutputSignal1);
 
-    // Avoid defining the input dependency if already satisfied.
-    if (InputSignal && !InputSignal->load())
-      InputSignal = nullptr;
-
     // Setup the post action for releasing the intermediate buffer.
     if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
       return Err;
 
+    // Wait for kernel to finish before scheduling the asynchronous copy.
+    if (UseSyncCopyBack && InputSignal && InputSignal->load())
+      if (auto Err = InputSignal->wait(StreamBusyWaitMicroseconds, RPCHandle))
+        return Err;
+
     // Issue the first step: device to host transfer. Avoid defining the input
     // dependency if already satisfied.
     hsa_status_t Status;
-    if (InputSignal) {
+    if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
       Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
                                          &InputSignalRaw, OutputSignal1->get());
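For context, `hsa_amd_memory_async_copy` does the dependency chaining here: the copy engine starts the transfer only once each input signal reaches zero, and it decrements the completion signal when the transfer is done. Below is a minimal standalone sketch of the two modes this hunk switches between; the helper name and its parameters are invented for illustration and are not the plugin's actual code.

```cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// Hypothetical helper contrasting the two scheduling modes. With
// SyncCopyBack enabled, the host blocks until the kernel's completion
// signal reaches zero and then issues an unchained copy; otherwise the
// copy is queued with the kernel's signal as an input dependency.
hsa_status_t issueCopyBack(void *Dst, hsa_agent_t DstAgent, const void *Src,
                           hsa_agent_t SrcAgent, size_t Size,
                           hsa_signal_t KernelDone, hsa_signal_t CopyDone,
                           bool SyncCopyBack) {
  if (SyncCopyBack) {
    // Actively spin until the kernel's completion signal drops to zero,
    // re-waiting if the wait returns early with a nonzero value.
    while (hsa_signal_wait_scacquire(KernelDone, HSA_SIGNAL_CONDITION_EQ, 0,
                                     UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
      ;
    return hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
                                     /*num_dep_signals=*/0,
                                     /*dep_signals=*/nullptr, CopyDone);
  }
  // Asynchronous mode: the copy engine starts once KernelDone reaches zero.
  return hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
                                   /*num_dep_signals=*/1, &KernelDone,
                                   CopyDone);
}
```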
@@ -2075,12 +2079,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
             64),
         OMPX_ForceSyncRegions("OMPX_FORCE_SYNC_REGIONS", 0),
         OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
+        OMPX_SyncCopyBack("LIBOMPTARGET_SYNC_COPY_BACK", true),
         AMDGPUStreamManager(*this), AMDGPUEventManager(*this),
         AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice),
         Queues() {}
 
   ~AMDGPUDeviceTy() {}
 
+  /// Return the synchronous copy back setting.
+  bool syncCopyBack() const { return OMPX_SyncCopyBack; }
+
   /// Returns the maximum number of HSA queues to create.
   /// This reads a non-cached environment variable, don't call everywhere.
   uint32_t getMaxNumHsaQueues() const {
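Assuming `BoolEnvar` follows the usual libomptarget convention of reading the variable once at construction and accepting common boolean spellings, a run can opt out of the blocking wait with e.g. `LIBOMPTARGET_SYNC_COPY_BACK=0`. A rough sketch of that style of lookup, with a hypothetical helper name standing in for `BoolEnvar`:

```cpp
#include <cctype>
#include <cstdlib>
#include <string>

// Hypothetical stand-in for BoolEnvar: read the variable if set, otherwise
// fall back to the compiled-in default. The accepted spellings are an
// assumption, not the plugin's exact parsing rules.
static bool readBoolEnvar(const char *Name, bool Default) {
  const char *Raw = std::getenv(Name);
  if (!Raw)
    return Default;
  std::string Value(Raw);
  for (char &C : Value)
    C = static_cast<char>(std::tolower(static_cast<unsigned char>(C)));
  if (Value == "1" || Value == "true" || Value == "on" || Value == "yes")
    return true;
  if (Value == "0" || Value == "false" || Value == "off" || Value == "no")
    return false;
  return Default;
}

// Mirrors the initializer above:
//   OMPX_SyncCopyBack("LIBOMPTARGET_SYNC_COPY_BACK", true)
// bool SyncCopyBack = readBoolEnvar("LIBOMPTARGET_SYNC_COPY_BACK", true);
```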
@@ -3141,6 +3149,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// are microseconds.
   UInt32Envar OMPX_StreamBusyWait;
 
+  /// Variable to hold the synchronous copy back setting.
+  BoolEnvar OMPX_SyncCopyBack;
+
   /// Stream manager for AMDGPU streams.
   AMDGPUStreamManagerTy AMDGPUStreamManager;
 
@@ -3280,7 +3291,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
       // Initialize the std::deque with some empty positions.
       Slots(32), NextSlot(0), SyncCycle(0), RPCHandle(nullptr),
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
-      Device(Device) {}
+      UseSyncCopyBack(Device.syncCopyBack()), Device(Device) {}
 
 /// Class implementing the AMDGPU-specific functionalities of the global
 /// handler.
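One design consequence worth noting: each stream caches the setting in `UseSyncCopyBack` at construction, and `OMPX_SyncCopyBack` itself reads the environment once when the device is initialized, so changing `LIBOMPTARGET_SYNC_COPY_BACK` after the plugin starts up should have no effect on existing streams.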