Skip to content

Commit 840d0b7

Browse files
[amdgpu] D2D memcpy via streams and HSA (#69977)
hsa_amd_memory_async_copy can handle device-to-device (D2D) copies if it is passed the corresponding parameters. No functional change: currently a D2D copy goes through a fallback in libomptarget that stages the data through a host malloc; after this change it goes directly through HSA. It works in exactly the situations where HSA itself works. Verified locally on a performance benchmark. Hoping to attract further testing from internal developers after it lands.
1 parent f0f5fdf commit 840d0b7

File tree

1 file changed

+51
-8
lines changed
  • openmp/libomptarget/plugins-nextgen/amdgpu/src

1 file changed

+51
-8
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,6 +1331,42 @@ struct AMDGPUStreamTy {
13311331
return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
13321332
}
13331333

1334+
/// Push an asynchronous device-to-device memory copy onto this stream.
///
/// Issues hsa_amd_memory_async_copy for CopySize bytes from Src to Dst,
/// forwarding DstAgent and SrcAgent as the destination and source agents.
/// Returns the error from SignalManager.getResources if no output signal can
/// be obtained; otherwise returns Plugin::check of the HSA status.
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
1335+
Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
1336+
hsa_agent_t SrcAgent, uint64_t CopySize) {
1337+
// Request a single signal from the signal manager; it is handed to HSA
// below as the last argument of the copy call.
AMDGPUSignalTy *OutputSignal;
1338+
if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
1339+
return Err;
1340+
OutputSignal->reset();
1341+
OutputSignal->increaseUseCount();
1342+
1343+
// The stream mutex is held from here until the function returns.
std::lock_guard<std::mutex> Lock(Mutex);
1344+
1345+
// Consume stream slot and compute dependencies.
1346+
auto [Curr, InputSignal] = consume(OutputSignal);
1347+
1348+
// Avoid defining the input dependency if already satisfied.
1349+
if (InputSignal && !InputSignal->load())
1350+
InputSignal = nullptr;
1351+
1352+
// The agents need to have access to the corresponding memory
1353+
// This is presently only true if the pointers were originally
1354+
// allocated by this runtime or the caller made the appropriate
1355+
// access calls.
1356+
1357+
hsa_status_t Status;
1358+
// NOTE(review): InputSignal was already cleared above when load() returned
// zero, so this second load() looks redundant apart from re-sampling the
// signal value — confirm before simplifying.
if (InputSignal && InputSignal->load()) {
1359+
// One dependency: the copy is issued with InputSignal as its only
// dependent signal.
hsa_signal_t InputSignalRaw = InputSignal->get();
1360+
Status =
1361+
hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
1362+
&InputSignalRaw, OutputSignal->get());
1363+
} else
1364+
// No dependency: the copy is issued with an empty dependency list.
Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
1365+
0, nullptr, OutputSignal->get());
1366+
1367+
return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
1368+
}
1369+
13341370
/// Synchronize with the stream. The current thread waits until all operations
13351371
/// are finalized and it performs the pending post actions (i.e., releasing
13361372
/// intermediate buffers).
@@ -2250,14 +2286,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
22502286
PinnedMemoryManager);
22512287
}
22522288

2253-
/// Exchange data between two devices within the plugin. This function is not
2254-
/// supported in this plugin.
2289+
/// Exchange data between two devices within the plugin.
22552290
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
22562291
void *DstPtr, int64_t Size,
22572292
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2258-
// This function should never be called because the function
2259-
// AMDGPUPluginTy::isDataExchangable() returns false.
2260-
return Plugin::error("dataExchangeImpl not supported");
2293+
AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
2294+
2295+
AMDGPUStreamTy *Stream = nullptr;
2296+
if (auto Err = getStream(AsyncInfoWrapper, Stream))
2297+
return Err;
2298+
if (Size <= 0)
2299+
return Plugin::success();
2300+
2301+
return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
2302+
getAgent(), (uint64_t)Size);
22612303
}
22622304

22632305
/// Initialize the async info for interoperability purposes.
@@ -2897,9 +2939,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
28972939
return true;
28982940
}
28992941

2900-
/// This plugin does not support exchanging data between two devices.
29012942
bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
2902-
return false;
2943+
return true;
29032944
}
29042945

29052946
/// Get the host device instance.
@@ -3174,8 +3215,10 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
31743215
return nullptr;
31753216
}
31763217

3177-
if (Alloc && (Kind == TARGET_ALLOC_HOST || Kind == TARGET_ALLOC_SHARED)) {
3218+
if (Alloc) {
31783219
auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents();
3220+
// Inherently necessary for host or shared allocations
3221+
// Also enabled for device memory to allow device to device memcpy
31793222

31803223
// Enable all kernel agents to access the buffer.
31813224
if (auto Err = MemoryPool->enableAccess(Alloc, Size, KernelAgents)) {

0 commit comments

Comments
 (0)