Skip to content

Commit aaf2cb5

Browse files
doru1004 authored and yanyao-wang committed
[libomptarget] Apply patch 1022243 Delay first D2H async copy until the kernel has finished
Original change: https://gerrit-git.amd.com/c/lightning/ec/llvm-project/+/1022243
Change-Id: I18b6aac0335032a1bb3f7b5da96dd75d9c26b785
1 parent 0ff8f52 commit aaf2cb5

File tree

1 file changed

+17
-6
lines changed
  • openmp/libomptarget/plugins-nextgen/amdgpu/src

1 file changed

+17
-6
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,6 +1403,9 @@ struct AMDGPUStreamTy {
14031403
/// Mutex to protect stream's management.
14041404
mutable std::mutex Mutex;
14051405

1406+
/// When true, wait for the kernel to finish before issuing the first device-to-host async copy.
1407+
bool UseSyncCopyBack;
1408+
14061409
/// Timeout hint for HSA actively waiting for signal value to change
14071410
const uint64_t StreamBusyWaitMicroseconds;
14081411

@@ -1625,18 +1628,19 @@ struct AMDGPUStreamTy {
16251628
// Consume stream slot and compute dependencies.
16261629
auto [Curr, InputSignal] = consume(OutputSignal1);
16271630

1628-
// Avoid defining the input dependency if already satisfied.
1629-
if (InputSignal && !InputSignal->load())
1630-
InputSignal = nullptr;
1631-
16321631
// Setup the post action for releasing the intermediate buffer.
16331632
if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
16341633
return Err;
16351634

1635+
// Wait for kernel to finish before scheduling the asynchronous copy.
1636+
if (UseSyncCopyBack && InputSignal && InputSignal->load())
1637+
if (auto Err = InputSignal->wait(StreamBusyWaitMicroseconds, RPCHandle))
1638+
return Err;
1639+
16361640
// Issue the first step: device to host transfer. Avoid defining the input
16371641
// dependency if already satisfied.
16381642
hsa_status_t Status;
1639-
if (InputSignal) {
1643+
if (InputSignal && InputSignal->load()) {
16401644
hsa_signal_t InputSignalRaw = InputSignal->get();
16411645
Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
16421646
&InputSignalRaw, OutputSignal1->get());
@@ -2075,12 +2079,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
20752079
64),
20762080
OMPX_ForceSyncRegions("OMPX_FORCE_SYNC_REGIONS", 0),
20772081
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
2082+
OMPX_SyncCopyBack("LIBOMPTARGET_SYNC_COPY_BACK", true),
20782083
AMDGPUStreamManager(*this), AMDGPUEventManager(*this),
20792084
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice),
20802085
Queues() {}
20812086

20822087
~AMDGPUDeviceTy() {}
20832088

2089+
/// Return whether synchronous copy back is enabled (the value of OMPX_SyncCopyBack).
2090+
bool syncCopyBack() const { return OMPX_SyncCopyBack; } // Backed by the LIBOMPTARGET_SYNC_COPY_BACK envar member.
2091+
20842092
/// Returns the maximum of HSA queues to create
20852093
/// This reads a non-cached environment variable, don't call everywhere.
20862094
uint32_t getMaxNumHsaQueues() const {
@@ -3141,6 +3149,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
31413149
/// are microseconds.
31423150
UInt32Envar OMPX_StreamBusyWait;
31433151

3152+
/// Envar LIBOMPTARGET_SYNC_COPY_BACK: whether to use synchronous copy back.
3153+
BoolEnvar OMPX_SyncCopyBack;
3154+
31443155
/// Stream manager for AMDGPU streams.
31453156
AMDGPUStreamManagerTy AMDGPUStreamManager;
31463157

@@ -3280,7 +3291,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
32803291
// Initialize the std::deque with some empty positions.
32813292
Slots(32), NextSlot(0), SyncCycle(0), RPCHandle(nullptr),
32823293
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
3283-
Device(Device) {}
3294+
UseSyncCopyBack(Device.syncCopyBack()), Device(Device) {}
32843295

32853296
/// Class implementing the AMDGPU-specific functionalities of the global
32863297
/// handler.

0 commit comments

Comments
 (0)