@@ -1331,6 +1331,42 @@ struct AMDGPUStreamTy {
1331
1331
return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1332
1332
}
1333
1333
1334
+ // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
1335
+ Error pushMemoryCopyD2DAsync (void *Dst, hsa_agent_t DstAgent, const void *Src,
1336
+ hsa_agent_t SrcAgent, uint64_t CopySize) {
1337
+ AMDGPUSignalTy *OutputSignal;
1338
+ if (auto Err = SignalManager.getResources (/* Num=*/ 1 , &OutputSignal))
1339
+ return Err;
1340
+ OutputSignal->reset ();
1341
+ OutputSignal->increaseUseCount ();
1342
+
1343
+ std::lock_guard<std::mutex> Lock (Mutex);
1344
+
1345
+ // Consume stream slot and compute dependencies.
1346
+ auto [Curr, InputSignal] = consume (OutputSignal);
1347
+
1348
+ // Avoid defining the input dependency if already satisfied.
1349
+ if (InputSignal && !InputSignal->load ())
1350
+ InputSignal = nullptr ;
1351
+
1352
+ // The agents need to have access to the corresponding memory
1353
+ // This is presently only true if the pointers were originally
1354
+ // allocated by this runtime or the caller made the appropriate
1355
+ // access calls.
1356
+
1357
+ hsa_status_t Status;
1358
+ if (InputSignal && InputSignal->load ()) {
1359
+ hsa_signal_t InputSignalRaw = InputSignal->get ();
1360
+ Status =
1361
+ hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize, 1 ,
1362
+ &InputSignalRaw, OutputSignal->get ());
1363
+ } else
1364
+ Status = hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize,
1365
+ 0 , nullptr , OutputSignal->get ());
1366
+
1367
+ return Plugin::check (Status, " Error in D2D hsa_amd_memory_async_copy: %s" );
1368
+ }
1369
+
1334
1370
// / Synchronize with the stream. The current thread waits until all operations
1335
1371
// / are finalized and it performs the pending post actions (i.e., releasing
1336
1372
// / intermediate buffers).
@@ -2250,14 +2286,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2250
2286
PinnedMemoryManager);
2251
2287
}
2252
2288
2253
- // / Exchange data between two devices within the plugin. This function is not
2254
- // / supported in this plugin.
2289
+ // / Exchange data between two devices within the plugin.
2255
2290
Error dataExchangeImpl (const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
2256
2291
void *DstPtr, int64_t Size,
2257
2292
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2258
- // This function should never be called because the function
2259
- // AMDGPUPluginTy::isDataExchangable() returns false.
2260
- return Plugin::error (" dataExchangeImpl not supported" );
2293
+ AMDGPUDeviceTy &DstDevice = static_cast <AMDGPUDeviceTy &>(DstGenericDevice);
2294
+
2295
+ AMDGPUStreamTy *Stream = nullptr ;
2296
+ if (auto Err = getStream (AsyncInfoWrapper, Stream))
2297
+ return Err;
2298
+ if (Size <= 0 )
2299
+ return Plugin::success ();
2300
+
2301
+ return Stream->pushMemoryCopyD2DAsync (DstPtr, DstDevice.getAgent (), SrcPtr,
2302
+ getAgent (), (uint64_t )Size);
2261
2303
}
2262
2304
2263
2305
// / Initialize the async info for interoperability purposes.
@@ -2897,9 +2939,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
2897
2939
return true ;
2898
2940
}
2899
2941
2900
- // / This plugin does not support exchanging data between two devices.
2901
2942
bool isDataExchangable (int32_t SrcDeviceId, int32_t DstDeviceId) override {
2902
- return false ;
2943
+ return true ;
2903
2944
}
2904
2945
2905
2946
// / Get the host device instance.
@@ -3174,8 +3215,10 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
3174
3215
return nullptr ;
3175
3216
}
3176
3217
3177
- if (Alloc && (Kind == TARGET_ALLOC_HOST || Kind == TARGET_ALLOC_SHARED) ) {
3218
+ if (Alloc) {
3178
3219
auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents ();
3220
+ // Access is inherently necessary for host or shared allocations.
3221
+ // It is also enabled for device memory to allow device-to-device memcpy.
3179
3222
3180
3223
// Enable all kernel agents to access the buffer.
3181
3224
if (auto Err = MemoryPool->enableAccess (Alloc, Size, KernelAgents)) {
0 commit comments