@@ -130,6 +130,38 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
130
130
" Error in hsa_amd_agent_iterate_memory_pools: %s" );
131
131
}
132
132
133
+ // / Dispatches an asynchronous memory copy
134
+ // / Enables different SDMA engines for the dispatch in a round-robin fashion.
135
+ Error asyncMemCopy (bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
136
+ const void *Src, hsa_agent_t SrcAgent, size_t Size,
137
+ uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
138
+ hsa_signal_t CompletionSignal) {
139
+ if (UseMultipleSdmaEngines) {
140
+ hsa_status_t S =
141
+ hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, Size,
142
+ NumDepSignals, DepSignals, CompletionSignal);
143
+ return Plugin::check (S, " Error in hsa_amd_memory_async_copy" );
144
+ }
145
+
146
+ // This solution is probably not the best
147
+ #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
148
+ HSA_AMD_INTERFACE_VERSION_MINOR >= 2 )
149
+ return Plugin::error (" Async copy on selected SDMA requires ROCm 5.7" );
150
+ #else
151
+ static int SdmaEngine = 1 ;
152
+
153
+ // This call is only avail in ROCm >= 5.7
154
+ hsa_status_t S = hsa_amd_memory_async_copy_on_engine (
155
+ Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
156
+ CompletionSignal, (hsa_amd_sdma_engine_id_t )SdmaEngine,
157
+ /* force_copy_on_sdma=*/ true );
158
+ // Increment to use one of three SDMA engines: 0x1, 0x2, 0x4
159
+ SdmaEngine = (SdmaEngine << 1 ) % 7 ;
160
+
161
+ return Plugin::check (S, " Error in hsa_amd_memory_async_copy_on_engine" );
162
+ #endif
163
+ }
164
+
133
165
} // namespace utils
134
166
135
167
// / Utility class representing generic resource references to AMDGPU resources.
@@ -945,6 +977,9 @@ struct AMDGPUStreamTy {
945
977
// / Timeout hint for HSA actively waiting for signal value to change
946
978
const uint64_t StreamBusyWaitMicroseconds;
947
979
980
+ // / Indicate to spread data transfers across all available SDMAs
981
+ bool UseMultipleSdmaEngines;
982
+
948
983
// / Return the current number of asynchronous operations on the stream.
949
984
uint32_t size () const { return NextSlot; }
950
985
@@ -1170,15 +1205,15 @@ struct AMDGPUStreamTy {
1170
1205
InputSignal = nullptr ;
1171
1206
1172
1207
// Issue the async memory copy.
1173
- hsa_status_t Status;
1174
1208
if (InputSignal) {
1175
1209
hsa_signal_t InputSignalRaw = InputSignal->get ();
1176
- Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 1 ,
1177
- &InputSignalRaw, OutputSignal->get ());
1178
- } else
1179
- Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 0 ,
1180
- nullptr , OutputSignal->get ());
1181
- return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1210
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1211
+ CopySize, 1 , &InputSignalRaw,
1212
+ OutputSignal->get ());
1213
+ }
1214
+
1215
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1216
+ CopySize, 0 , nullptr , OutputSignal->get ());
1182
1217
}
1183
1218
1184
1219
// / Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1249,19 @@ struct AMDGPUStreamTy {
1214
1249
1215
1250
// Issue the first step: device to host transfer. Avoid defining the input
1216
1251
// dependency if already satisfied.
1217
- hsa_status_t Status;
1218
1252
if (InputSignal) {
1219
1253
hsa_signal_t InputSignalRaw = InputSignal->get ();
1220
- Status =
1221
- hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 1 ,
1222
- &InputSignalRaw, OutputSignals[0 ]->get ());
1254
+ if (auto Err = utils::asyncMemCopy (
1255
+ UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1 ,
1256
+ &InputSignalRaw, OutputSignals[0 ]->get ()))
1257
+ return Err;
1223
1258
} else {
1224
- Status = hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 0 ,
1225
- nullptr , OutputSignals[0 ]->get ());
1259
+ if (auto Err = utils::asyncMemCopy (UseMultipleSdmaEngines, Inter, Agent,
1260
+ Src, Agent, CopySize, 0 , nullptr ,
1261
+ OutputSignals[0 ]->get ()))
1262
+ return Err;
1226
1263
}
1227
1264
1228
- if (auto Err =
1229
- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
1230
- return Err;
1231
-
1232
1265
// Consume another stream slot and compute dependencies.
1233
1266
std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
1234
1267
assert (InputSignal && " Invalid input signal" );
@@ -1242,7 +1275,7 @@ struct AMDGPUStreamTy {
1242
1275
std::atomic_thread_fence (std::memory_order_release);
1243
1276
1244
1277
// Issue the second step: host to host transfer.
1245
- Status = hsa_amd_signal_async_handler (
1278
+ hsa_status_t Status = hsa_amd_signal_async_handler (
1246
1279
InputSignal->get (), HSA_SIGNAL_CONDITION_EQ, 0 , asyncActionCallback,
1247
1280
(void *)&Slots[Curr]);
1248
1281
@@ -1318,16 +1351,14 @@ struct AMDGPUStreamTy {
1318
1351
1319
1352
// Issue the second step: host to device transfer. Avoid defining the input
1320
1353
// dependency if already satisfied.
1321
- hsa_status_t Status;
1322
1354
if (InputSignal && InputSignal->load ()) {
1323
1355
hsa_signal_t InputSignalRaw = InputSignal->get ();
1324
- Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 1 ,
1325
- &InputSignalRaw, OutputSignal->get ());
1326
- } else
1327
- Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 0 ,
1328
- nullptr , OutputSignal->get ());
1329
-
1330
- return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1356
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter,
1357
+ Agent, CopySize, 1 , &InputSignalRaw,
1358
+ OutputSignal->get ());
1359
+ }
1360
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
1361
+ CopySize, 0 , nullptr , OutputSignal->get ());
1331
1362
}
1332
1363
1333
1364
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1384,15 @@ struct AMDGPUStreamTy {
1353
1384
// allocated by this runtime or the caller made the appropriate
1354
1385
// access calls.
1355
1386
1356
- hsa_status_t Status;
1357
1387
if (InputSignal && InputSignal->load ()) {
1358
1388
hsa_signal_t InputSignalRaw = InputSignal->get ();
1359
- Status =
1360
- hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize, 1 ,
1361
- &InputSignalRaw, OutputSignal->get ());
1362
- } else
1363
- Status = hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize,
1364
- 0 , nullptr , OutputSignal->get ());
1365
-
1366
- return Plugin::check (Status, " Error in D2D hsa_amd_memory_async_copy: %s" );
1389
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1390
+ SrcAgent, CopySize, 1 , &InputSignalRaw,
1391
+ OutputSignal->get ());
1392
+ }
1393
+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1394
+ SrcAgent, CopySize, 0 , nullptr ,
1395
+ OutputSignal->get ());
1367
1396
}
1368
1397
1369
1398
// / Synchronize with the stream. The current thread waits until all operations
@@ -1788,6 +1817,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1788
1817
OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
1789
1818
64 ),
1790
1819
OMPX_StreamBusyWait (" LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000 ),
1820
+ OMPX_UseMultipleSdmaEngines (
1821
+ " LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false ),
1791
1822
AMDGPUStreamManager (*this , Agent), AMDGPUEventManager(*this ),
1792
1823
AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice) {}
1793
1824
@@ -2196,10 +2227,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2196
2227
if (auto Err = Signal.init ())
2197
2228
return Err;
2198
2229
2199
- Status = hsa_amd_memory_async_copy (TgtPtr, Agent, PinnedPtr, Agent, Size,
2200
- 0 , nullptr , Signal.get ());
2201
- if (auto Err =
2202
- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
2230
+ if (auto Err = utils::asyncMemCopy (getUseMultipleSdmaEngines (), TgtPtr,
2231
+ Agent, PinnedPtr, Agent, Size, 0 ,
2232
+ nullptr , Signal.get ()))
2203
2233
return Err;
2204
2234
2205
2235
if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2257,10 +2287,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2257
2287
if (auto Err = Signal.init ())
2258
2288
return Err;
2259
2289
2260
- Status = hsa_amd_memory_async_copy (PinnedPtr, Agent, TgtPtr, Agent, Size,
2261
- 0 , nullptr , Signal.get ());
2262
- if (auto Err =
2263
- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
2290
+ if (auto Err = utils::asyncMemCopy (getUseMultipleSdmaEngines (), PinnedPtr,
2291
+ Agent, TgtPtr, Agent, Size, 0 , nullptr ,
2292
+ Signal.get ()))
2264
2293
return Err;
2265
2294
2266
2295
if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2623,6 +2652,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2623
2652
});
2624
2653
}
2625
2654
2655
/// Return the value of the OMPX_UseMultipleSdmaEngines envar, which is
/// initialized from LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES with a
/// default of false.
+ bool getUseMultipleSdmaEngines () { return OMPX_UseMultipleSdmaEngines; }
2656
+
2626
2657
private:
2627
2658
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2628
2659
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2660,6 +2691,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2660
2691
// / are microseconds.
2661
2692
UInt32Envar OMPX_StreamBusyWait;
2662
2693
2694
+ // / Use ROCm 5.7 interface for multiple SDMA engines
2695
+ BoolEnvar OMPX_UseMultipleSdmaEngines;
2696
+
2663
2697
// / Stream manager for AMDGPU streams.
2664
2698
AMDGPUStreamManagerTy AMDGPUStreamManager;
2665
2699
@@ -2761,7 +2795,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
2761
2795
SignalManager (Device.getSignalManager()), Device(Device),
2762
2796
// Initialize the std::deque with some empty positions.
2763
2797
Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer(nullptr ),
2764
- StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
2798
+ StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
2799
+ UseMultipleSdmaEngines(Device.getUseMultipleSdmaEngines()) {}
2765
2800
2766
2801
// / Class implementing the AMDGPU-specific functionalities of the global
2767
2802
// / handler.
0 commit comments