Skip to content

Commit def7c42

Browse files
author
Hugh Delaney
committed
Fix host task mem migration for CUDA and HIP
The SYCL RT assumes that for devices in the same context, no mem migration needs to occur across devices for a kernel launch or host task. However, a CUdeviceptr is relevant to a specific device, so mem migration must occur between devices in a context. Since this assumption that the SYCL RT makes — that native mems are accessible to all devices in a context — does not hold, it must hand off the HT lambda to the plugin, so that the plugin can handle the necessary mem migration. This patch uses the new urEnqueueCustomCommandExp to execute the HT lambda, which takes care of mem migration implicitly in the plugin.
1 parent 3040061 commit def7c42

File tree

2 files changed

+43
-9
lines changed

2 files changed

+43
-9
lines changed

sycl/source/detail/scheduler/commands.cpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -317,9 +317,23 @@ static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
317317
}
318318
}
319319

320+
namespace {
321+
322+
struct EnqueueNativeCommandData {
323+
sycl::interop_handle ih;
324+
std::function<void(interop_handle)> func;
325+
};
326+
327+
void InteropFreeFunc(pi_queue InteropQueue, void *InteropData) {
328+
auto *Data = reinterpret_cast<EnqueueNativeCommandData *>(InteropData);
329+
return Data->func(Data->ih);
330+
};
331+
} // namespace
332+
320333
class DispatchHostTask {
321334
ExecCGCommand *MThisCmd;
322335
std::vector<interop_handle::ReqToMem> MReqToMem;
336+
std::vector<pi_mem> MReqPiMem;
323337

324338
bool waitForEvents() const {
325339
std::map<const PluginPtr, std::vector<EventImplPtr>>
@@ -365,8 +379,10 @@ class DispatchHostTask {
365379

366380
public:
367381
DispatchHostTask(ExecCGCommand *ThisCmd,
368-
std::vector<interop_handle::ReqToMem> ReqToMem)
369-
: MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {}
382+
std::vector<interop_handle::ReqToMem> ReqToMem,
383+
std::vector<pi_mem> ReqPiMem)
384+
: MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)),
385+
MReqPiMem(std::move(ReqPiMem)) {}
370386

371387
void operator()() const {
372388
assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask);
@@ -402,8 +418,27 @@ class DispatchHostTask {
402418
interop_handle IH{MReqToMem, HostTask.MQueue,
403419
HostTask.MQueue->getDeviceImplPtr(),
404420
HostTask.MQueue->getContextImplPtr()};
405-
406-
HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH);
421+
if (IH.get_backend() == backend::ext_oneapi_cuda ||
422+
IH.get_backend() == backend::ext_oneapi_hip) {
423+
EnqueueNativeCommandData CustomOpData{
424+
IH, HostTask.MHostTask->MInteropTask};
425+
426+
// We are assuming that we have already synchronized with the HT's
427+
// dependent events, and that the user will synchronize before the end
428+
// of the HT lambda. As such we don't pass in any events, or ask for
429+
// one back.
430+
//
431+
// This entry point is needed in order to migrate memory across
432+
// devices in the same context for CUDA and HIP backends
433+
HostTask.MQueue->getPlugin()
434+
->call<PiApiKind::piextEnqueueNativeCommand>(
435+
HostTask.MQueue->getHandleRef(), InteropFreeFunc,
436+
&CustomOpData, MReqPiMem.size(), MReqPiMem.data(),
437+
0, nullptr, nullptr);
438+
} else {
439+
HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(),
440+
IH);
441+
}
407442
} else
408443
HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo());
409444
} catch (...) {
@@ -3121,13 +3156,14 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
31213156
}
31223157

31233158
std::vector<interop_handle::ReqToMem> ReqToMem;
3159+
std::vector<pi_mem> ReqPiMem;
31243160

31253161
if (HostTask->MHostTask->isInteropTask()) {
31263162
// Extract the Mem Objects for all Requirements, to ensure they are
31273163
// available if a user asks for them inside the interop task scope
31283164
const std::vector<Requirement *> &HandlerReq =
31293165
HostTask->getRequirements();
3130-
auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) {
3166+
auto ReqToMemConv = [&ReqToMem, &ReqPiMem, HostTask](Requirement *Req) {
31313167
const std::vector<AllocaCommandBase *> &AllocaCmds =
31323168
Req->MSYCLMemObj->MRecord->MAllocaCommands;
31333169

@@ -3137,6 +3173,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
31373173
auto MemArg =
31383174
reinterpret_cast<pi_mem>(AllocaCmd->getMemAllocation());
31393175
ReqToMem.emplace_back(std::make_pair(Req, MemArg));
3176+
ReqPiMem.emplace_back(MemArg);
31403177

31413178
return;
31423179
}
@@ -3158,7 +3195,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
31583195
copySubmissionCodeLocation();
31593196

31603197
MQueue->getThreadPool().submit<DispatchHostTask>(
3161-
DispatchHostTask(this, std::move(ReqToMem)));
3198+
DispatchHostTask(this, std::move(ReqToMem)), std::move(ReqPiMem));
31623199

31633200
MShouldCompleteEventIfPossible = false;
31643201

sycl/test-e2e/HostInteropTask/interop-task-cuda-buffer-migrate.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
11
// REQUIRES: cuda
2-
// XFAIL: cuda
3-
//
4-
// FIXME: this is broken with a multi device context
52
//
63
// RUN: %{build} -o %t.out -lcuda
74
// RUN: %{run} %t.out

0 commit comments

Comments
 (0)