@@ -130,45 +130,6 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
130
130
" Error in hsa_amd_agent_iterate_memory_pools: %s" );
131
131
}
132
132
133
- // / Dispatches an asynchronous memory copy.
134
- // / Enables different SDMA engines for the dispatch in a round-robin fashion.
135
- Error asyncMemCopy (bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
136
- const void *Src, hsa_agent_t SrcAgent, size_t Size,
137
- uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
138
- hsa_signal_t CompletionSignal) {
139
- if (UseMultipleSdmaEngines) {
140
- hsa_status_t S =
141
- hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, Size,
142
- NumDepSignals, DepSignals, CompletionSignal);
143
- return Plugin::check (S, " Error in hsa_amd_memory_async_copy: %s" );
144
- }
145
-
146
- // This solution is probably not the best
147
- #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
148
- HSA_AMD_INTERFACE_VERSION_MINOR >= 2 )
149
- return Plugin::error (" Async copy on selected SDMA requires ROCm 5.7" );
150
- #else
151
- static std::atomic<int > SdmaEngine{1 };
152
-
153
- // This atomics solution is probably not the best, but should be sufficient
154
- // for now.
155
- // In a worst case scenario, in which threads read the same value, they will
156
- // dispatch to the same SDMA engine. This may result in sub-optimal
157
- // performance. However, I think the possibility to be fairly low.
158
- int LocalSdmaEngine = SdmaEngine.load (std::memory_order_acquire);
159
- // This call is only avail in ROCm >= 5.7
160
- hsa_status_t S = hsa_amd_memory_async_copy_on_engine (
161
- Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
162
- CompletionSignal, (hsa_amd_sdma_engine_id_t )LocalSdmaEngine,
163
- /* force_copy_on_sdma=*/ true );
164
- // Increment to use one of three SDMA engines: 0x1, 0x2, 0x4
165
- LocalSdmaEngine = (LocalSdmaEngine << 1 ) % 7 ;
166
- SdmaEngine.store (LocalSdmaEngine, std::memory_order_relaxed);
167
-
168
- return Plugin::check (S, " Error in hsa_amd_memory_async_copy_on_engine: %s" );
169
- #endif
170
- }
171
-
172
133
} // namespace utils
173
134
174
135
// / Utility class representing generic resource references to AMDGPU resources.
@@ -984,9 +945,6 @@ struct AMDGPUStreamTy {
984
945
// / Timeout hint for HSA actively waiting for signal value to change
985
946
const uint64_t StreamBusyWaitMicroseconds;
986
947
987
- // / Indicate to spread data transfers across all avilable SDMAs
988
- bool UseMultipleSdmaEngines;
989
-
990
948
// / Return the current number of asychronous operations on the stream.
991
949
uint32_t size () const { return NextSlot; }
992
950
@@ -1212,15 +1170,15 @@ struct AMDGPUStreamTy {
1212
1170
InputSignal = nullptr ;
1213
1171
1214
1172
// Issue the async memory copy.
1173
+ hsa_status_t Status;
1215
1174
if (InputSignal) {
1216
1175
hsa_signal_t InputSignalRaw = InputSignal->get ();
1217
- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1218
- CopySize, 1 , &InputSignalRaw,
1219
- OutputSignal->get ());
1220
- }
1221
-
1222
- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1223
- CopySize, 0 , nullptr , OutputSignal->get ());
1176
+ Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 1 ,
1177
+ &InputSignalRaw, OutputSignal->get ());
1178
+ } else
1179
+ Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 0 ,
1180
+ nullptr , OutputSignal->get ());
1181
+ return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1224
1182
}
1225
1183
1226
1184
// / Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1256,19 +1214,21 @@ struct AMDGPUStreamTy {
1256
1214
1257
1215
// Issue the first step: device to host transfer. Avoid defining the input
1258
1216
// dependency if already satisfied.
1217
+ hsa_status_t Status;
1259
1218
if (InputSignal) {
1260
1219
hsa_signal_t InputSignalRaw = InputSignal->get ();
1261
- if (auto Err = utils::asyncMemCopy (
1262
- UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1 ,
1263
- &InputSignalRaw, OutputSignals[0 ]->get ()))
1264
- return Err;
1220
+ Status =
1221
+ hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 1 ,
1222
+ &InputSignalRaw, OutputSignals[0 ]->get ());
1265
1223
} else {
1266
- if (auto Err = utils::asyncMemCopy (UseMultipleSdmaEngines, Inter, Agent,
1267
- Src, Agent, CopySize, 0 , nullptr ,
1268
- OutputSignals[0 ]->get ()))
1269
- return Err;
1224
+ Status = hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 0 ,
1225
+ nullptr , OutputSignals[0 ]->get ());
1270
1226
}
1271
1227
1228
+ if (auto Err =
1229
+ Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
1230
+ return Err;
1231
+
1272
1232
// Consume another stream slot and compute dependencies.
1273
1233
std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
1274
1234
assert (InputSignal && " Invalid input signal" );
@@ -1282,7 +1242,7 @@ struct AMDGPUStreamTy {
1282
1242
std::atomic_thread_fence (std::memory_order_release);
1283
1243
1284
1244
// Issue the second step: host to host transfer.
1285
- hsa_status_t Status = hsa_amd_signal_async_handler (
1245
+ Status = hsa_amd_signal_async_handler (
1286
1246
InputSignal->get (), HSA_SIGNAL_CONDITION_EQ, 0 , asyncActionCallback,
1287
1247
(void *)&Slots[Curr]);
1288
1248
@@ -1358,14 +1318,16 @@ struct AMDGPUStreamTy {
1358
1318
1359
1319
// Issue the second step: host to device transfer. Avoid defining the input
1360
1320
// dependency if already satisfied.
1321
+ hsa_status_t Status;
1361
1322
if (InputSignal && InputSignal->load ()) {
1362
1323
hsa_signal_t InputSignalRaw = InputSignal->get ();
1363
- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter,
1364
- Agent, CopySize, 1 , &InputSignalRaw,
1365
- OutputSignal->get ());
1366
- }
1367
- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
1368
- CopySize, 0 , nullptr , OutputSignal->get ());
1324
+ Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 1 ,
1325
+ &InputSignalRaw, OutputSignal->get ());
1326
+ } else
1327
+ Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 0 ,
1328
+ nullptr , OutputSignal->get ());
1329
+
1330
+ return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1369
1331
}
1370
1332
1371
1333
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1391,15 +1353,17 @@ struct AMDGPUStreamTy {
1391
1353
// allocated by this runtime or the caller made the appropriate
1392
1354
// access calls.
1393
1355
1356
+ hsa_status_t Status;
1394
1357
if (InputSignal && InputSignal->load ()) {
1395
1358
hsa_signal_t InputSignalRaw = InputSignal->get ();
1396
- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1397
- SrcAgent, CopySize, 1 , &InputSignalRaw,
1398
- OutputSignal->get ());
1399
- }
1400
- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1401
- SrcAgent, CopySize, 0 , nullptr ,
1402
- OutputSignal->get ());
1359
+ Status =
1360
+ hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize, 1 ,
1361
+ &InputSignalRaw, OutputSignal->get ());
1362
+ } else
1363
+ Status = hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize,
1364
+ 0 , nullptr , OutputSignal->get ());
1365
+
1366
+ return Plugin::check (Status, " Error in D2D hsa_amd_memory_async_copy: %s" );
1403
1367
}
1404
1368
1405
1369
// / Synchronize with the stream. The current thread waits until all operations
@@ -1824,8 +1788,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
1824
1788
OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
1825
1789
64 ),
1826
1790
OMPX_StreamBusyWait (" LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000 ),
1827
- OMPX_UseMultipleSdmaEngines (
1828
- " LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false ),
1829
1791
AMDGPUStreamManager (*this , Agent), AMDGPUEventManager(*this ),
1830
1792
AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice) {}
1831
1793
@@ -2244,9 +2206,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2244
2206
if (auto Err = Signal.init ())
2245
2207
return Err;
2246
2208
2247
- if (auto Err = utils::asyncMemCopy (useMultipleSdmaEngines (), TgtPtr,
2248
- Agent, PinnedPtr, Agent, Size, 0 ,
2249
- nullptr , Signal.get ()))
2209
+ Status = hsa_amd_memory_async_copy (TgtPtr, Agent, PinnedPtr, Agent, Size,
2210
+ 0 , nullptr , Signal.get ());
2211
+ if (auto Err =
2212
+ Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
2250
2213
return Err;
2251
2214
2252
2215
if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2304,9 +2267,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2304
2267
if (auto Err = Signal.init ())
2305
2268
return Err;
2306
2269
2307
- if (auto Err = utils::asyncMemCopy (useMultipleSdmaEngines (), PinnedPtr,
2308
- Agent, TgtPtr, Agent, Size, 0 , nullptr ,
2309
- Signal.get ()))
2270
+ Status = hsa_amd_memory_async_copy (PinnedPtr, Agent, TgtPtr, Agent, Size,
2271
+ 0 , nullptr , Signal.get ());
2272
+ if (auto Err =
2273
+ Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
2310
2274
return Err;
2311
2275
2312
2276
if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2669,8 +2633,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2669
2633
});
2670
2634
}
2671
2635
2672
- bool useMultipleSdmaEngines () const { return OMPX_UseMultipleSdmaEngines; }
2673
-
2674
2636
private:
2675
2637
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
2676
2638
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2740,9 +2702,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2740
2702
// / are microseconds.
2741
2703
UInt32Envar OMPX_StreamBusyWait;
2742
2704
2743
- // / Use ROCm 5.7 interface for multiple SDMA engines
2744
- BoolEnvar OMPX_UseMultipleSdmaEngines;
2745
-
2746
2705
// / Stream manager for AMDGPU streams.
2747
2706
AMDGPUStreamManagerTy AMDGPUStreamManager;
2748
2707
@@ -2844,8 +2803,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
2844
2803
SignalManager (Device.getSignalManager()), Device(Device),
2845
2804
// Initialize the std::deque with some empty positions.
2846
2805
Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer(nullptr ),
2847
- StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
2848
- UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
2806
+ StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
2849
2807
2850
2808
// / Class implementing the AMDGPU-specific functionalities of the global
2851
2809
// / handler.
0 commit comments