@@ -621,9 +621,9 @@ struct AMDGPUSignalTy {
621
621
}
622
622
623
623
// / Wait until the signal gets a zero value.
624
- Error wait (const uint64_t ActiveTimeout = 0 , RPCServerTy *RPCServer = nullptr ,
624
+ Error wait (const uint64_t ActiveTimeout = 0 ,
625
625
GenericDeviceTy *Device = nullptr ) const {
626
- if (ActiveTimeout && !RPCServer ) {
626
+ if (ActiveTimeout) {
627
627
hsa_signal_value_t Got = 1 ;
628
628
Got = hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
629
629
ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -632,14 +632,11 @@ struct AMDGPUSignalTy {
632
632
}
633
633
634
634
// If there is an RPC device attached to this stream we run it as a server.
635
- uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
636
- auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
635
+ uint64_t Timeout = UINT64_MAX;
636
+ auto WaitState = HSA_WAIT_STATE_BLOCKED;
637
637
while (hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
638
- Timeout, WaitState) != 0 ) {
639
- if (RPCServer && Device)
640
- if (auto Err = RPCServer->runServer (*Device))
641
- return Err;
642
- }
638
+ Timeout, WaitState) != 0 )
639
+ ;
643
640
return Plugin::success ();
644
641
}
645
642
@@ -1048,11 +1045,6 @@ struct AMDGPUStreamTy {
1048
1045
// / operation that was already finalized in a previous stream synchronize.
1049
1046
uint32_t SyncCycle;
1050
1047
1051
- // / A pointer associated with an RPC server running on the given device. If
1052
- // / RPC is not being used this will be a null pointer. Otherwise, this
1053
- // / indicates that an RPC server is expected to be run on this stream.
1054
- RPCServerTy *RPCServer;
1055
-
1056
1048
// / Mutex to protect stream's management.
1057
1049
mutable std::mutex Mutex;
1058
1050
@@ -1232,9 +1224,6 @@ struct AMDGPUStreamTy {
1232
1224
// / Deinitialize the stream's signals.
1233
1225
Error deinit () { return Plugin::success (); }
1234
1226
1235
- // / Attach an RPC server to this stream.
1236
- void setRPCServer (RPCServerTy *Server) { RPCServer = Server; }
1237
-
1238
1227
// / Push an asynchronous kernel to the stream. The kernel arguments must be
1239
1228
// / placed in a special allocation for kernel args and must keep alive until
1240
1229
// / the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1262,10 +1251,30 @@ struct AMDGPUStreamTy {
1262
1251
if (auto Err = Slots[Curr].schedReleaseBuffer (KernelArgs, MemoryManager))
1263
1252
return Err;
1264
1253
1254
+ // If we are running an RPC server we want to wake up the server thread
1255
+ // whenever there is a kernel running and let it sleep otherwise.
1256
+ if (Device.getRPCServer ())
1257
+ Device.Plugin .getRPCServer ().Thread ->notify ();
1258
+
1265
1259
// Push the kernel with the output signal and an input signal (optional)
1266
- return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
1267
- GroupSize, StackSize, OutputSignal,
1268
- InputSignal);
1260
+ if (auto Err = Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads,
1261
+ NumBlocks, GroupSize, StackSize,
1262
+ OutputSignal, InputSignal))
1263
+ return Err;
1264
+
1265
+ // Register a callback to indicate when the kernel is complete.
1266
+ if (Device.getRPCServer ()) {
1267
+ if (auto Err = Slots[Curr].schedCallback (
1268
+ [](void *Data) -> llvm::Error {
1269
+ GenericPluginTy &Plugin =
1270
+ *reinterpret_cast <GenericPluginTy *>(Data);
1271
+ Plugin.getRPCServer ().Thread ->finish ();
1272
+ return Error::success ();
1273
+ },
1274
+ &Device.Plugin ))
1275
+ return Err;
1276
+ }
1277
+ return Plugin::success ();
1269
1278
}
1270
1279
1271
1280
// / Push an asynchronous memory copy between pinned memory buffers.
@@ -1475,8 +1484,8 @@ struct AMDGPUStreamTy {
1475
1484
return Plugin::success ();
1476
1485
1477
1486
// Wait until all previous operations on the stream have completed.
1478
- if (auto Err = Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds,
1479
- RPCServer , &Device))
1487
+ if (auto Err =
1488
+ Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds , &Device))
1480
1489
return Err;
1481
1490
1482
1491
// Reset the stream and perform all pending post actions.
@@ -3025,7 +3034,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
3025
3034
: Agent(Device.getAgent()), Queue(nullptr ),
3026
3035
SignalManager (Device.getSignalManager()), Device(Device),
3027
3036
// Initialize the std::deque with some empty positions.
3028
- Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer( nullptr ),
3037
+ Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
3029
3038
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
3030
3039
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
3031
3040
@@ -3378,10 +3387,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3378
3387
if (auto Err = AMDGPUDevice.getStream (AsyncInfoWrapper, Stream))
3379
3388
return Err;
3380
3389
3381
- // If this kernel requires an RPC server we attach its pointer to the stream.
3382
- if (GenericDevice.getRPCServer ())
3383
- Stream->setRPCServer (GenericDevice.getRPCServer ());
3384
-
3385
3390
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
3386
3391
if (ImplArgs &&
3387
3392
getImplicitArgsSize () == sizeof (hsa_utils::AMDGPUImplicitArgsTy)) {
0 commit comments