Skip to content

Commit cc9e19e

Browse files
committed
Revert "[OpenMP][libomptarget] Enable parallel copies via multiple SDMA engines (#71801)"
This causes the tests to fail because the bots were not updated in time. Revert until we update the bots to a valid version. This reverts commit e876250.
1 parent 1654d7d commit cc9e19e

File tree

1 file changed

+44
-86
lines changed
  • openmp/libomptarget/plugins-nextgen/amdgpu/src

1 file changed

+44
-86
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 44 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -130,45 +130,6 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
130130
"Error in hsa_amd_agent_iterate_memory_pools: %s");
131131
}
132132

133-
/// Dispatches an asynchronous memory copy.
134-
/// Enables different SDMA engines for the dispatch in a round-robin fashion.
135-
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
136-
const void *Src, hsa_agent_t SrcAgent, size_t Size,
137-
uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
138-
hsa_signal_t CompletionSignal) {
139-
if (UseMultipleSdmaEngines) {
140-
hsa_status_t S =
141-
hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
142-
NumDepSignals, DepSignals, CompletionSignal);
143-
return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
144-
}
145-
146-
// This solution is probably not the best
147-
#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
148-
HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
149-
return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
150-
#else
151-
static std::atomic<int> SdmaEngine{1};
152-
153-
// This atomics solution is probably not the best, but should be sufficient
154-
// for now.
155-
// In a worst case scenario, in which threads read the same value, they will
156-
// dispatch to the same SDMA engine. This may result in sub-optimal
157-
// performance. However, I think the possibility to be fairly low.
158-
int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
159-
// This call is only avail in ROCm >= 5.7
160-
hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
161-
Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
162-
CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
163-
/*force_copy_on_sdma=*/true);
164-
// Increment to use one of three SDMA engines: 0x1, 0x2, 0x4
165-
LocalSdmaEngine = (LocalSdmaEngine << 1) % 7;
166-
SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);
167-
168-
return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
169-
#endif
170-
}
171-
172133
} // namespace utils
173134

174135
/// Utility class representing generic resource references to AMDGPU resources.
@@ -984,9 +945,6 @@ struct AMDGPUStreamTy {
984945
/// Timeout hint for HSA actively waiting for signal value to change
985946
const uint64_t StreamBusyWaitMicroseconds;
986947

987-
/// Indicate to spread data transfers across all avilable SDMAs
988-
bool UseMultipleSdmaEngines;
989-
990948
/// Return the current number of asychronous operations on the stream.
991949
uint32_t size() const { return NextSlot; }
992950

@@ -1212,15 +1170,15 @@ struct AMDGPUStreamTy {
12121170
InputSignal = nullptr;
12131171

12141172
// Issue the async memory copy.
1173+
hsa_status_t Status;
12151174
if (InputSignal) {
12161175
hsa_signal_t InputSignalRaw = InputSignal->get();
1217-
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1218-
CopySize, 1, &InputSignalRaw,
1219-
OutputSignal->get());
1220-
}
1221-
1222-
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1223-
CopySize, 0, nullptr, OutputSignal->get());
1176+
Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
1177+
&InputSignalRaw, OutputSignal->get());
1178+
} else
1179+
Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
1180+
nullptr, OutputSignal->get());
1181+
return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
12241182
}
12251183

12261184
/// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1256,19 +1214,21 @@ struct AMDGPUStreamTy {
12561214

12571215
// Issue the first step: device to host transfer. Avoid defining the input
12581216
// dependency if already satisfied.
1217+
hsa_status_t Status;
12591218
if (InputSignal) {
12601219
hsa_signal_t InputSignalRaw = InputSignal->get();
1261-
if (auto Err = utils::asyncMemCopy(
1262-
UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
1263-
&InputSignalRaw, OutputSignals[0]->get()))
1264-
return Err;
1220+
Status =
1221+
hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
1222+
&InputSignalRaw, OutputSignals[0]->get());
12651223
} else {
1266-
if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
1267-
Src, Agent, CopySize, 0, nullptr,
1268-
OutputSignals[0]->get()))
1269-
return Err;
1224+
Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
1225+
nullptr, OutputSignals[0]->get());
12701226
}
12711227

1228+
if (auto Err =
1229+
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
1230+
return Err;
1231+
12721232
// Consume another stream slot and compute dependencies.
12731233
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
12741234
assert(InputSignal && "Invalid input signal");
@@ -1282,7 +1242,7 @@ struct AMDGPUStreamTy {
12821242
std::atomic_thread_fence(std::memory_order_release);
12831243

12841244
// Issue the second step: host to host transfer.
1285-
hsa_status_t Status = hsa_amd_signal_async_handler(
1245+
Status = hsa_amd_signal_async_handler(
12861246
InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
12871247
(void *)&Slots[Curr]);
12881248

@@ -1358,14 +1318,16 @@ struct AMDGPUStreamTy {
13581318

13591319
// Issue the second step: host to device transfer. Avoid defining the input
13601320
// dependency if already satisfied.
1321+
hsa_status_t Status;
13611322
if (InputSignal && InputSignal->load()) {
13621323
hsa_signal_t InputSignalRaw = InputSignal->get();
1363-
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
1364-
Agent, CopySize, 1, &InputSignalRaw,
1365-
OutputSignal->get());
1366-
}
1367-
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
1368-
CopySize, 0, nullptr, OutputSignal->get());
1324+
Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
1325+
&InputSignalRaw, OutputSignal->get());
1326+
} else
1327+
Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
1328+
nullptr, OutputSignal->get());
1329+
1330+
return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
13691331
}
13701332

13711333
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1391,15 +1353,17 @@ struct AMDGPUStreamTy {
13911353
// allocated by this runtime or the caller made the appropriate
13921354
// access calls.
13931355

1356+
hsa_status_t Status;
13941357
if (InputSignal && InputSignal->load()) {
13951358
hsa_signal_t InputSignalRaw = InputSignal->get();
1396-
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1397-
SrcAgent, CopySize, 1, &InputSignalRaw,
1398-
OutputSignal->get());
1399-
}
1400-
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
1401-
SrcAgent, CopySize, 0, nullptr,
1402-
OutputSignal->get());
1359+
Status =
1360+
hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
1361+
&InputSignalRaw, OutputSignal->get());
1362+
} else
1363+
Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
1364+
0, nullptr, OutputSignal->get());
1365+
1366+
return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
14031367
}
14041368

14051369
/// Synchronize with the stream. The current thread waits until all operations
@@ -1824,8 +1788,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
18241788
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
18251789
64),
18261790
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
1827-
OMPX_UseMultipleSdmaEngines(
1828-
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
18291791
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
18301792
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
18311793

@@ -2244,9 +2206,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
22442206
if (auto Err = Signal.init())
22452207
return Err;
22462208

2247-
if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
2248-
Agent, PinnedPtr, Agent, Size, 0,
2249-
nullptr, Signal.get()))
2209+
Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
2210+
0, nullptr, Signal.get());
2211+
if (auto Err =
2212+
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
22502213
return Err;
22512214

22522215
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2304,9 +2267,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
23042267
if (auto Err = Signal.init())
23052268
return Err;
23062269

2307-
if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
2308-
Agent, TgtPtr, Agent, Size, 0, nullptr,
2309-
Signal.get()))
2270+
Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
2271+
0, nullptr, Signal.get());
2272+
if (auto Err =
2273+
Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
23102274
return Err;
23112275

23122276
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2669,8 +2633,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26692633
});
26702634
}
26712635

2672-
bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
2673-
26742636
private:
26752637
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
26762638
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2740,9 +2702,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27402702
/// are microseconds.
27412703
UInt32Envar OMPX_StreamBusyWait;
27422704

2743-
/// Use ROCm 5.7 interface for multiple SDMA engines
2744-
BoolEnvar OMPX_UseMultipleSdmaEngines;
2745-
27462705
/// Stream manager for AMDGPU streams.
27472706
AMDGPUStreamManagerTy AMDGPUStreamManager;
27482707

@@ -2844,8 +2803,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
28442803
SignalManager(Device.getSignalManager()), Device(Device),
28452804
// Initialize the std::deque with some empty positions.
28462805
Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
2847-
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
2848-
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
2806+
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
28492807

28502808
/// Class implementing the AMDGPU-specific functionalities of the global
28512809
/// handler.

0 commit comments

Comments
 (0)