Skip to content

Commit cc8dc5e

Browse files
carlobertolli, JonChesterfield
authored and committed
[OpenMP][AMDGPU] Switch host-device memory copy to asynchronous version
Prepare the amdgpu plugin for an asynchronous implementation. This patch switches to using the HSA API for asynchronous memory copy. Moving away from hsa_memory_copy means that the plugin is now responsible for locking/unlocking host memory pointers. Reviewed By: JonChesterfield. Differential Revision: https://reviews.llvm.org/D115279
1 parent 04e79cf commit cc8dc5e

File tree

5 files changed

+94
-54
lines changed

5 files changed

+94
-54
lines changed

openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ DLWRAP(hsa_amd_memory_pool_free, 1);
4747
DLWRAP(hsa_amd_memory_async_copy, 8);
4848
DLWRAP(hsa_amd_memory_pool_get_info, 3);
4949
DLWRAP(hsa_amd_agents_allow_access, 4);
50+
DLWRAP(hsa_amd_memory_lock, 5);
51+
DLWRAP(hsa_amd_memory_unlock, 1);
5052
DLWRAP(hsa_amd_memory_fill, 3);
5153
DLWRAP(hsa_amd_register_system_event_handler, 2);
5254

openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ hsa_status_t hsa_amd_agents_allow_access(uint32_t num_agents,
7676
const uint32_t *flags,
7777
const void *ptr);
7878

79+
hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
80+
hsa_agent_t* agents, int num_agent,
81+
void** agent_ptr);
82+
83+
hsa_status_t hsa_amd_memory_unlock(void* host_ptr);
84+
7985
hsa_status_t hsa_amd_memory_fill(void *ptr, uint32_t value, size_t count);
8086

8187
typedef enum hsa_amd_event_type_s {

openmp/libomptarget/plugins/amdgpu/impl/impl.cpp

Lines changed: 75 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,39 +5,34 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
77
//===----------------------------------------------------------------------===//
8-
#include "hsa_api.h"
9-
#include "impl_runtime.h"
10-
#include "internal.h"
118
#include "rt.h"
129
#include <memory>
1310

1411
/*
1512
* Data
1613
*/
1714

18-
static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest,
19-
const void *src, size_t size,
20-
hsa_agent_t agent) {
15+
// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
16+
static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
17+
hsa_agent_t agent, const void *src,
18+
size_t size) {
2119
const hsa_signal_value_t init = 1;
2220
const hsa_signal_value_t success = 0;
23-
hsa_signal_store_screlease(sig, init);
21+
hsa_signal_store_screlease(signal, init);
2422

25-
hsa_status_t err =
26-
hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig);
27-
if (err != HSA_STATUS_SUCCESS) {
23+
hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
24+
nullptr, signal);
25+
if (err != HSA_STATUS_SUCCESS)
2826
return err;
29-
}
3027

3128
// async_copy reports success by decrementing and failure by setting to < 0
3229
hsa_signal_value_t got = init;
33-
while (got == init) {
34-
got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init,
30+
while (got == init)
31+
got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
3532
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
36-
}
3733

38-
if (got != success) {
34+
if (got != success)
3935
return HSA_STATUS_ERROR;
40-
}
4136

4237
return err;
4338
}
@@ -48,19 +43,58 @@ struct implFreePtrDeletor {
4843
}
4944
};
5045

46+
enum CopyDirection { H2D, D2H };
47+
48+
static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
49+
hsa_signal_t signal, void *dest,
50+
hsa_agent_t agent, void *src,
51+
void *lockingPtr, size_t size) {
52+
hsa_status_t err;
53+
54+
void *lockedPtr = nullptr;
55+
err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
56+
if (err != HSA_STATUS_SUCCESS)
57+
return err;
58+
59+
switch (direction) {
60+
case H2D:
61+
err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
62+
break;
63+
case D2H:
64+
err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
65+
break;
66+
default:
67+
err = HSA_STATUS_ERROR; // fall into unlock before returning
68+
}
69+
70+
if (err != HSA_STATUS_SUCCESS) {
71+
// do not leak locked host pointers, but discard potential error message
72+
hsa_amd_memory_unlock(lockingPtr);
73+
return err;
74+
}
75+
76+
err = hsa_amd_memory_unlock(lockingPtr);
77+
if (err != HSA_STATUS_SUCCESS)
78+
return err;
79+
80+
return HSA_STATUS_SUCCESS;
81+
}
82+
5183
hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
52-
const void *hostSrc, size_t size,
53-
hsa_agent_t agent,
84+
void *hostSrc, size_t size,
85+
hsa_agent_t device_agent,
5486
hsa_amd_memory_pool_t MemoryPool) {
55-
hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size);
87+
hsa_status_t err;
5688

57-
// hsa_memory_copy sometimes fails in situations where
89+
err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
90+
device_agent, hostSrc, hostSrc, size);
91+
92+
if (err == HSA_STATUS_SUCCESS)
93+
return err;
94+
95+
// async memcpy sometimes fails in situations where
5896
// allocate + copy succeeds. Looks like it might be related to
5997
// locking part of a read only segment. Fall back for now.
60-
if (rc == HSA_STATUS_SUCCESS) {
61-
return HSA_STATUS_SUCCESS;
62-
}
63-
6498
void *tempHostPtr;
6599
hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
66100
if (ret != HSA_STATUS_SUCCESS) {
@@ -70,26 +104,26 @@ hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
70104
std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
71105
memcpy(tempHostPtr, hostSrc, size);
72106

73-
if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) !=
74-
HSA_STATUS_SUCCESS) {
75-
return HSA_STATUS_ERROR;
76-
}
77-
return HSA_STATUS_SUCCESS;
107+
return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
108+
device_agent, tempHostPtr, tempHostPtr, size);
78109
}
79110

80-
hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest,
81-
const void *deviceSrc, size_t size,
82-
hsa_agent_t agent,
111+
hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
112+
void *deviceSrc, size_t size,
113+
hsa_agent_t deviceAgent,
83114
hsa_amd_memory_pool_t MemoryPool) {
84-
hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size);
115+
hsa_status_t err;
116+
117+
// device has always visibility over both pointers, so use that
118+
err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
119+
deviceSrc, hostDest, size);
120+
121+
if (err == HSA_STATUS_SUCCESS)
122+
return err;
85123

86124
// hsa_memory_copy sometimes fails in situations where
87125
// allocate + copy succeeds. Looks like it might be related to
88126
// locking part of a read only segment. Fall back for now.
89-
if (rc == HSA_STATUS_SUCCESS) {
90-
return HSA_STATUS_SUCCESS;
91-
}
92-
93127
void *tempHostPtr;
94128
hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
95129
if (ret != HSA_STATUS_SUCCESS) {
@@ -98,11 +132,11 @@ hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest,
98132
}
99133
std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
100134

101-
if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) !=
102-
HSA_STATUS_SUCCESS) {
135+
err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
136+
deviceAgent, deviceSrc, tempHostPtr, size);
137+
if (err != HSA_STATUS_SUCCESS)
103138
return HSA_STATUS_ERROR;
104-
}
105139

106-
memcpy(dest, tempHostPtr, size);
140+
memcpy(hostDest, tempHostPtr, size);
107141
return HSA_STATUS_SUCCESS;
108142
}

openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,12 @@ hsa_status_t impl_module_register_from_memory_to_place(
1919
void *cb_state);
2020

2121
hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
22-
const void *hostSrc, size_t size,
23-
hsa_agent_t agent,
22+
void *hostSrc, size_t size,
23+
hsa_agent_t device_agent,
2424
hsa_amd_memory_pool_t MemoryPool);
2525

26-
hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest,
27-
const void *deviceSrc, size_t size,
28-
hsa_agent_t agent,
26+
hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc,
27+
size_t size, hsa_agent_t device_agent,
2928
hsa_amd_memory_pool_t MemoryPool);
3029
}
3130

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -464,10 +464,9 @@ class RTLDeviceInfoTy {
464464
"");
465465
static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size;
466466

467-
using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
468-
size_t size, hsa_agent_t,
469-
hsa_amd_memory_pool_t);
470-
hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
467+
using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t size,
468+
hsa_agent_t, hsa_amd_memory_pool_t);
469+
hsa_status_t freesignalpool_memcpy(void *dest, void *src, size_t size,
471470
MemcpyFunc Func, int32_t deviceId) {
472471
hsa_agent_t agent = HSAAgents[deviceId];
473472
hsa_signal_t s = FreeSignalPool.pop();
@@ -479,13 +478,13 @@ class RTLDeviceInfoTy {
479478
return r;
480479
}
481480

482-
hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
483-
size_t size, int32_t deviceId) {
481+
hsa_status_t freesignalpool_memcpy_d2h(void *dest, void *src, size_t size,
482+
int32_t deviceId) {
484483
return freesignalpool_memcpy(dest, src, size, impl_memcpy_d2h, deviceId);
485484
}
486485

487-
hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
488-
size_t size, int32_t deviceId) {
486+
hsa_status_t freesignalpool_memcpy_h2d(void *dest, void *src, size_t size,
487+
int32_t deviceId) {
489488
return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId);
490489
}
491490

0 commit comments

Comments
 (0)