Skip to content

Commit 02fada7

Browse files
committed
[OpenMP][libomptarget] Enable parallel copies via multiple SDMA engines
This enables the AMDGPU plugin to use a new ROCm 5.7 interface to dispatch asynchronous data transfers across SDMA engines. The default functionality stays unchanged, meaning that all data transfers are enqueued into an H2D queue or a D2H queue, depending on transfer direction, via the HSA interface used previously. The new interface can be enabled via the environment variable LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true when libomptarget is built against a recent ROCm version (5.7 and later). As of now, requests are distributed in a round-robin fashion across available SDMA engines.
1 parent e3d750c commit 02fada7

File tree

1 file changed

+79
-44
lines changed
  • openmp/libomptarget/plugins-nextgen/amdgpu/src

1 file changed

+79
-44
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 79 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,38 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
130130
"Error in hsa_amd_agent_iterate_memory_pools: %s");
131131
}
132132

133+
/// Dispatches an asynchronous memory copy
134+
/// Enables different SDMA engines for the dispatch in a round-robin fashion.
135+
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
136+
const void *Src, hsa_agent_t SrcAgent, size_t Size,
137+
uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
138+
hsa_signal_t CompletionSignal) {
139+
if (UseMultipleSdmaEngines) {
140+
hsa_status_t S =
141+
hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
142+
NumDepSignals, DepSignals, CompletionSignal);
143+
return Plugin::check(S, "Error in hsa_amd_memory_async_copy");
144+
}
145+
146+
// This solution is probably not the best
147+
#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
148+
HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
149+
return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
150+
#else
151+
static int SdmaEngine = 1;
152+
153+
// This call is only avail in ROCm >= 5.7
154+
hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
155+
Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
156+
CompletionSignal, (hsa_amd_sdma_engine_id_t)SdmaEngine,
157+
/*force_copy_on_sdma=*/true);
158+
// Increment to use one of three SDMA engines: 0x1, 0x2, 0x4
159+
SdmaEngine = (SdmaEngine << 1) % 7;
160+
161+
return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine");
162+
#endif
163+
}
164+
133165
} // namespace utils
134166

135167
/// Utility class representing generic resource references to AMDGPU resources.
@@ -945,6 +977,9 @@ struct AMDGPUStreamTy {
945977
/// Timeout hint for HSA actively waiting for signal value to change
946978
const uint64_t StreamBusyWaitMicroseconds;
947979

980+
/// Indicates whether to spread data transfers across all available SDMA engines.
981+
bool UseMultipleSdmaEngines;
982+
948983
/// Return the current number of asynchronous operations on the stream.
949984
uint32_t size() const { return NextSlot; }
950985

@@ -1170,15 +1205,15 @@ struct AMDGPUStreamTy {
11701205
InputSignal = nullptr;
11711206

11721207
// Issue the async memory copy.
1173-
hsa_status_t Status;
11741208
if (InputSignal) {
11751209
hsa_signal_t InputSignalRaw = InputSignal->get();
1176-
Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
1177-
&InputSignalRaw, OutputSignal->get());
1178-
} else
1179-
Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
1180-
nullptr, OutputSignal->get());
1181-
return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
1210+
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1211+
CopySize, 1, &InputSignalRaw,
1212+
OutputSignal->get());
1213+
}
1214+
1215+
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1216+
CopySize, 0, nullptr, OutputSignal->get());
11821217
}
11831218

11841219
/// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1249,19 @@ struct AMDGPUStreamTy {
12141249

12151250
// Issue the first step: device to host transfer. Avoid defining the input
12161251
// dependency if already satisfied.
1217-
hsa_status_t Status;
12181252
if (InputSignal) {
12191253
hsa_signal_t InputSignalRaw = InputSignal->get();
1220-
Status =
1221-
hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
1222-
&InputSignalRaw, OutputSignals[0]->get());
1254+
if (auto Err = utils::asyncMemCopy(
1255+
UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
1256+
&InputSignalRaw, OutputSignals[0]->get()))
1257+
return Err;
12231258
} else {
1224-
Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
1225-
nullptr, OutputSignals[0]->get());
1259+
if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
1260+
Src, Agent, CopySize, 0, nullptr,
1261+
OutputSignals[0]->get()))
1262+
return Err;
12261263
}
12271264

1228-
if (auto Err =
1229-
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
1230-
return Err;
1231-
12321265
// Consume another stream slot and compute dependencies.
12331266
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
12341267
assert(InputSignal && "Invalid input signal");
@@ -1242,7 +1275,7 @@ struct AMDGPUStreamTy {
12421275
std::atomic_thread_fence(std::memory_order_release);
12431276

12441277
// Issue the second step: host to host transfer.
1245-
Status = hsa_amd_signal_async_handler(
1278+
hsa_status_t Status = hsa_amd_signal_async_handler(
12461279
InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
12471280
(void *)&Slots[Curr]);
12481281

@@ -1318,16 +1351,14 @@ struct AMDGPUStreamTy {
13181351

13191352
// Issue the second step: host to device transfer. Avoid defining the input
13201353
// dependency if already satisfied.
1321-
hsa_status_t Status;
13221354
if (InputSignal && InputSignal->load()) {
13231355
hsa_signal_t InputSignalRaw = InputSignal->get();
1324-
Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
1325-
&InputSignalRaw, OutputSignal->get());
1326-
} else
1327-
Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
1328-
nullptr, OutputSignal->get());
1329-
1330-
return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
1356+
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
1357+
Agent, CopySize, 1, &InputSignalRaw,
1358+
OutputSignal->get());
1359+
}
1360+
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
1361+
CopySize, 0, nullptr, OutputSignal->get());
13311362
}
13321363

13331364
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1384,15 @@ struct AMDGPUStreamTy {
13531384
// allocated by this runtime or the caller made the appropriate
13541385
// access calls.
13551386

1356-
hsa_status_t Status;
13571387
if (InputSignal && InputSignal->load()) {
13581388
hsa_signal_t InputSignalRaw = InputSignal->get();
1359-
Status =
1360-
hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
1361-
&InputSignalRaw, OutputSignal->get());
1362-
} else
1363-
Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
1364-
0, nullptr, OutputSignal->get());
1365-
1366-
return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
1389+
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1390+
SrcAgent, CopySize, 1, &InputSignalRaw,
1391+
OutputSignal->get());
1392+
}
1393+
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1394+
SrcAgent, CopySize, 0, nullptr,
1395+
OutputSignal->get());
13671396
}
13681397

13691398
/// Synchronize with the stream. The current thread waits until all operations
@@ -1788,6 +1817,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
17881817
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
17891818
64),
17901819
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
1820+
OMPX_UseMultipleSdmaEngines(
1821+
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
17911822
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
17921823
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
17931824

@@ -2196,10 +2227,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
21962227
if (auto Err = Signal.init())
21972228
return Err;
21982229

2199-
Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
2200-
0, nullptr, Signal.get());
2201-
if (auto Err =
2202-
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
2230+
if (auto Err = utils::asyncMemCopy(getUseMultipleSdmaEngines(), TgtPtr,
2231+
Agent, PinnedPtr, Agent, Size, 0,
2232+
nullptr, Signal.get()))
22032233
return Err;
22042234

22052235
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2257,10 +2287,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
22572287
if (auto Err = Signal.init())
22582288
return Err;
22592289

2260-
Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
2261-
0, nullptr, Signal.get());
2262-
if (auto Err =
2263-
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
2290+
if (auto Err = utils::asyncMemCopy(getUseMultipleSdmaEngines(), PinnedPtr,
2291+
Agent, TgtPtr, Agent, Size, 0, nullptr,
2292+
Signal.get()))
22642293
return Err;
22652294

22662295
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2623,6 +2652,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26232652
});
26242653
}
26252654

2655+
/// Returns the value of OMPX_UseMultipleSdmaEngines, i.e. the setting read
/// from LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES (defaults to false).
bool getUseMultipleSdmaEngines() { return OMPX_UseMultipleSdmaEngines; }
2656+
26262657
private:
26272658
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
26282659
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2660,6 +2691,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26602691
/// are microseconds.
26612692
UInt32Envar OMPX_StreamBusyWait;
26622693

2694+
/// Use ROCm 5.7 interface for multiple SDMA engines
2695+
BoolEnvar OMPX_UseMultipleSdmaEngines;
2696+
26632697
/// Stream manager for AMDGPU streams.
26642698
AMDGPUStreamManagerTy AMDGPUStreamManager;
26652699

@@ -2761,7 +2795,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
27612795
SignalManager(Device.getSignalManager()), Device(Device),
27622796
// Initialize the std::deque with some empty positions.
27632797
Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
2764-
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
2798+
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
2799+
UseMultipleSdmaEngines(Device.getUseMultipleSdmaEngines()) {}
27652800

27662801
/// Class implementing the AMDGPU-specific functionalities of the global
27672802
/// handler.

0 commit comments

Comments
 (0)