ROCm
diff --git a/‎offload/plugins-nextgen/amdgpu/src/rtl.cpp
Lines changed: 34 additions & 29 deletions b/‎offload/plugins-nextgen/amdgpu/src/rtl.cpp
Lines changed: 34 additions & 29 deletions
diff --git a/‎offload/plugins-nextgen/common/include/PluginInterface.h
Lines changed: 0 additions & 6 deletions b/‎offload/plugins-nextgen/common/include/PluginInterface.h
Lines changed: 0 additions & 6 deletions
diff --git a/‎offload/plugins-nextgen/common/include/RPC.h
Lines changed: 75 additions & 5 deletions b/‎offload/plugins-nextgen/common/include/RPC.h
Lines changed: 75 additions & 5 deletions
diff --git a/‎offload/plugins-nextgen/common/src/PluginInterface.cpp
Lines changed: 4 additions & 29 deletions b/‎offload/plugins-nextgen/common/src/PluginInterface.cpp
Lines changed: 4 additions & 29 deletions
@@ -1382,9 +1382,9 @@ struct AMDGPUSignalTy {
   }
 
   /// Wait until the signal gets a zero value.
-  Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
+  Error wait(const uint64_t ActiveTimeout = 0,
              GenericDeviceTy *Device = nullptr) const {
-    if (ActiveTimeout && !RPCServer) {
+    if (ActiveTimeout) {
       hsa_signal_value_t Got = 1;
       Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                       ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -1393,14 +1393,11 @@ struct AMDGPUSignalTy {
     }
 
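-    // If there is an RPC device attached to this stream we run it as a server.
+    // Block until the signal reaches zero. The RPC server is no longer run from
+    // this wait loop.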
-    uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
-    auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
+    uint64_t Timeout = UINT64_MAX;
+    auto WaitState = HSA_WAIT_STATE_BLOCKED;
     while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                     Timeout, WaitState) != 0) {
-      if (RPCServer && Device)
-        if (auto Err = RPCServer->runServer(*Device))
-          return Err;
-    }
+                                     Timeout, WaitState) != 0)
+      ;
     return Plugin::success();
   }
 
@@ -1895,11 +1892,6 @@ struct AMDGPUStreamTy {
   /// operation that was already finalized in a previous stream sycnhronize.
   uint32_t SyncCycle;
 
-  /// A pointer associated with an RPC server running on the given device. If
-  /// RPC is not being used this will be a null pointer. Otherwise, this
-  /// indicates that an RPC server is expected to be run on this stream.
-  RPCServerTy *RPCServer;
-
   /// Mutex to protect stream's management.
   mutable std::mutex Mutex;
 
@@ -2136,9 +2128,6 @@ struct AMDGPUStreamTy {
 
   hsa_queue_t *getHsaQueue() { return Queue->getHsaQueue(); }
 
-  /// Attach an RPC server to this stream.
-  void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
-
   /// Push a asynchronous kernel to the stream. The kernel arguments must be
   /// placed in a special allocation for kernel args and must keep alive until
   /// the kernel finalizes. Once the kernel is finished, the stream will release
@@ -2194,9 +2183,30 @@ struct AMDGPUStreamTy {
 
     // Push the kernel with the output signal and an input signal (optional)
     DP("Using Queue: %p with HSA Queue: %p\n", Queue, Queue->getHsaQueue());
-    return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
-                                   GroupSize, StackSize, OutputSignal,
-                                   InputSignal);
+    // If we are running an RPC server we want to wake up the server thread
+    // whenever there is a kernel running and let it sleep otherwise.
+    if (Device.getRPCServer())
+      Device.Plugin.getRPCServer().Thread->notify();
+
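+    // Push the kernel with the output signal and an input signal (optional).
+    // NOTE(review): notify() above already incremented the server's user count;
+    // returning on error here skips finish(), so the count looks like it stays
+    // elevated after a failed launch -- confirm and balance if so.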
+    if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
+                                           NumBlocks, GroupSize, StackSize,
+                                           OutputSignal, InputSignal))
+      return Err;
+
+    // Register a callback to indicate when the kernel is complete.
+    if (Device.getRPCServer()) {
+      if (auto Err = Slots[Curr].schedCallback(
+              [](void *Data) -> llvm::Error {
+                GenericPluginTy &Plugin =
+                    *reinterpret_cast<GenericPluginTy *>(Data);
+                Plugin.getRPCServer().Thread->finish();
+                return Error::success();
+              },
+              &Device.Plugin))
+        return Err;
+    }
+    return Plugin::success();
   }
 
   /// Push an asynchronous memory copy between pinned memory buffers.
@@ -2268,9 +2278,8 @@ struct AMDGPUStreamTy {
 
     // Wait for kernel to finish before scheduling the asynchronous copy.
     if (UseSyncCopyBack && InputSignal && InputSignal->load())
-      if (auto Err = InputSignal->wait(StreamBusyWaitMicroseconds, RPCServer, &Device))
+      if (auto Err = InputSignal->wait(StreamBusyWaitMicroseconds, &Device))
         return Err;
-
 #ifdef OMPT_SUPPORT
 
     if (OmptInfo) {
@@ -2457,8 +2466,8 @@ struct AMDGPUStreamTy {
       return Plugin::success();
 
     // Wait until all previous operations on the stream have completed.
-    if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
-                                              RPCServer, &Device))
+    if (auto Err =
+            Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device))
       return Err;
 
     // Reset the stream and perform all pending post actions.
@@ -4701,7 +4710,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
     : Agent(Device.getAgent()), Queue(nullptr),
       SignalManager(Device.getSignalManager()), Device(Device),
       // Initialize the std::deque with some empty positions.
-      Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
+      Slots(32), NextSlot(0), SyncCycle(0),
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
       UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
       UseSyncCopyBack(Device.syncCopyBack()) {}
@@ -5117,10 +5126,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
     DP("No hostrpc buffer or service thread required\n");
   }
 
-  // If this kernel requires an RPC server we attach its pointer to the stream.
-  if (GenericDevice.getRPCServer())
-    Stream->setRPCServer(GenericDevice.getRPCServer());
-
   // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
   if (ImplArgs &&
       getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
 
@@ -797,12 +797,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Setup the global device memory pool, if the plugin requires one.
   Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
                               uint64_t PoolSize);
-
-  // Setup the RPC server for this device if needed. This may not run on some
-  // plugins like the CPU targets. By default, it will not be executed so it is
-  // up to the target to override this using the shouldSetupRPCServer function.
-  Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
-
   /// Synchronize the current thread with the pending operations on the
   /// __tgt_async_info structure.
   Error synchronize(__tgt_async_info *AsyncInfo);
 
@@ -19,7 +19,11 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Error.h"
 
+#include <atomic>
+#include <condition_variable>
 #include <cstdint>
+#include <mutex>
+#include <thread>
 
 namespace llvm::omp::target {
 namespace plugin {
@@ -37,6 +41,12 @@ struct RPCServerTy {
   /// Initializes the handles to the number of devices we may need to service.
   RPCServerTy(plugin::GenericPluginTy &Plugin);
 
+  /// Deinitialize the associated memory and resources.
+  llvm::Error shutDown();
+
+  /// Initialize the worker thread.
+  llvm::Error startThread();
+
   /// Check if this device image is using an RPC server. This checks for the
   /// precense of an externally visible symbol in the device image that will
   /// be present whenever RPC code is called.
@@ -51,17 +61,77 @@ struct RPCServerTy {
                          plugin::GenericGlobalHandlerTy &Handler,
                          plugin::DeviceImageTy &Image);
 
-  /// Runs the RPC server associated with the \p Device until the pending work
-  /// is cleared.
-  llvm::Error runServer(plugin::GenericDeviceTy &Device);
-
   /// Deinitialize the RPC server for the given device. This will free the
   /// memory associated with the k
   llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);
 
 private:
   /// Array from this device's identifier to its attached devices.
-  llvm::SmallVector<void *> Buffers;
+  std::unique_ptr<void *[]> Buffers;
+
+  /// Array of associated devices. These must be alive as long as the server is.
+  std::unique_ptr<plugin::GenericDeviceTy *[]> Devices;
+
+  /// A helper class for running the user thread that handles the RPC interface.
+  /// Because we only need to check the RPC server while any kernels are
+  /// working, we track submission / completion events to allow the thread to
+  /// sleep when it is not needed. Submissions are reported with notify() and
+  /// completions with finish(); finish() asserts that a matching notify() is
+  /// still outstanding.
+  struct ServerThread {
+    /// The worker thread object. The constructor below leaves it
+    /// default-constructed, so no thread exists until one is assigned to it
+    /// (presumably by startThread(); its body is not visible in this header).
+    std::thread Worker;
+
+    /// A boolean indicating whether or not the worker thread should continue.
+    /// Set to true by the constructor; the destructor asserts it is false.
+    std::atomic<bool> Running;
+
+    /// The number of currently executing kernels across all devices that need
+    /// the server thread to be running. Incremented by notify() and
+    /// decremented by finish().
+    std::atomic<uint32_t> NumUsers;
+
+    /// The condition variable used to suspend the thread if no work is needed.
+    /// notify() signals it while holding Mutex. NOTE(review): the waiting side
+    /// presumably lives in run() and uses the same Mutex -- confirm there.
+    std::condition_variable CV;
+    std::mutex Mutex;
+
+    /// A reference to all the RPC interfaces that the server is handling. This
+    /// is a non-owning view over the array passed to the constructor.
+    llvm::ArrayRef<void *> Buffers;
+
+    /// A reference to the associated generic device for the buffer. This is a
+    /// non-owning view with the same length as Buffers.
+    llvm::ArrayRef<plugin::GenericDeviceTy *> Devices;
+
+    /// Set up the bookkeeping for the worker. This does not start a thread: it
+    /// only marks the server as running with zero users and records views of
+    /// the caller's arrays.
+    ///
+    /// \param Buffers Array of RPC buffer pointers; not copied.
+    /// \param Devices Array of device pointers; not copied.
+    /// \param Length  Number of entries in both \p Buffers and \p Devices.
+    ServerThread(void *Buffers[], plugin::GenericDeviceTy *Devices[],
+                 size_t Length)
+        : Running(true), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length),
+          Devices(Devices, Length) {}
+
+    /// Running must already be false when the object is destroyed (checked
+    /// only in assert-enabled builds). Because the constructor sets Running to
+    /// true, this also applies to an instance whose thread was never started.
+    /// NOTE(review): presumably shutDown() is what clears Running -- confirm.
+    ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
+
+    /// Notify the worker thread that there is a user that needs it.
+    void notify() {
+      // The increment and the wake-up both happen while Mutex is held.
+      // NOTE(review): presumably this keeps the update from racing with the
+      // worker's check-then-wait sequence -- confirm against run().
+      std::lock_guard<decltype(Mutex)> Lock(Mutex);
+      NumUsers.fetch_add(1, std::memory_order_relaxed);
+      CV.notify_all();
+    }
+
+    /// Indicate that one of the dependent users has finished. Unlike notify(),
+    /// this takes no lock and does not signal CV.
+    void finish() {
+      // Old is only read by the assert below, hence [[maybe_unused]] for
+      // builds where asserts are compiled out.
+      [[maybe_unused]] uint32_t Old =
+          NumUsers.fetch_sub(1, std::memory_order_relaxed);
+      assert(Old > 0 && "Attempt to signal finish with no pending work");
+    }
+
+    /// Destroy the worker thread and wait. Defined out of line.
+    void shutDown();
+
+    /// Initialize the worker thread. Defined out of line.
+    void startThread();
+
+    /// Run the server thread to continuously check the RPC interface for work
+    /// to be done for the device. Defined out of line.
+    void run();
+  };
+
+public:
+  /// Pointer to the server thread instance.
+  std::unique_ptr<ServerThread> Thread;
 };
 
 } // namespace llvm::omp::target
 
@@ -1132,10 +1132,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
     } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
       return std::move(Err);
   }
-
-  if (auto Err = setupRPCServer(Plugin, *Image))
-    return std::move(Err);
-
 #ifdef OMPT_SUPPORT
   if (ompt::Initialized) {
     size_t Bytes =
@@ -1249,30 +1245,6 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
   return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
 }
 
-Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
-                                      DeviceImageTy &Image) {
-  // The plugin either does not need an RPC server or it is unavailible.
-  if (!shouldSetupRPCServer())
-    return Plugin::success();
-
-  // Check if this device needs to run an RPC server.
-  RPCServerTy &Server = Plugin.getRPCServer();
-  auto UsingOrErr =
-      Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
-  if (!UsingOrErr)
-    return UsingOrErr.takeError();
-
-  if (!UsingOrErr.get())
-    return Plugin::success();
-
-  if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
-    return Err;
-
-  RPCServer = &Server;
-  DP("Running an RPC server on device %d\n", getDeviceId());
-  return Plugin::success();
-}
-
 Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
                                          size_t Size, bool ExternallyLocked) {
   // Insert the new entry into the map.
@@ -1892,8 +1864,11 @@ Error GenericPluginTy::deinit() {
     delete GlobalHandler;
 
 #if RPC_FIXME
-  if (RPCServer)
+  if (RPCServer) {
+    if (Error Err = RPCServer->shutDown())
+      return Err;
     delete RPCServer;
+  }
 #endif
 
   if (RecordReplay)