@@ -1382,9 +1382,9 @@ struct AMDGPUSignalTy {
1382
1382
}
1383
1383
1384
1384
/// Wait until the signal gets a zero value.
1385
- Error wait (const uint64_t ActiveTimeout = 0 , RPCServerTy *RPCServer = nullptr ,
1385
+ Error wait (const uint64_t ActiveTimeout = 0 ,
1386
1386
GenericDeviceTy *Device = nullptr ) const {
1387
- if (ActiveTimeout && !RPCServer ) {
1387
+ if (ActiveTimeout) {
1388
1388
hsa_signal_value_t Got = 1 ;
1389
1389
Got = hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
1390
1390
ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -1393,14 +1393,11 @@ struct AMDGPUSignalTy {
1393
1393
}
1394
1394
1395
1395
// Block until the signal reaches zero; the RPC server is no longer driven from this wait loop.
1396
- uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
1397
- auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
1396
+ uint64_t Timeout = UINT64_MAX;
1397
+ auto WaitState = HSA_WAIT_STATE_BLOCKED;
1398
1398
while (hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
1399
- Timeout, WaitState) != 0 ) {
1400
- if (RPCServer && Device)
1401
- if (auto Err = RPCServer->runServer (*Device))
1402
- return Err;
1403
- }
1399
+ Timeout, WaitState) != 0 )
1400
+ ;
1404
1401
return Plugin::success ();
1405
1402
}
1406
1403
@@ -1895,11 +1892,6 @@ struct AMDGPUStreamTy {
1895
1892
/// operation that was already finalized in a previous stream synchronize.
1896
1893
uint32_t SyncCycle;
1897
1894
1898
- // / A pointer associated with an RPC server running on the given device. If
1899
- // / RPC is not being used this will be a null pointer. Otherwise, this
1900
- // / indicates that an RPC server is expected to be run on this stream.
1901
- RPCServerTy *RPCServer;
1902
-
1903
1895
// / Mutex to protect stream's management.
1904
1896
mutable std::mutex Mutex;
1905
1897
@@ -2136,9 +2128,6 @@ struct AMDGPUStreamTy {
2136
2128
2137
2129
/// Return the HSA queue pointer obtained from this stream's Queue object.
hsa_queue_t *getHsaQueue () { return Queue->getHsaQueue (); }
2138
2130
2139
- // / Attach an RPC server to this stream.
2140
- void setRPCServer (RPCServerTy *Server) { RPCServer = Server; }
2141
-
2142
2131
/// Push an asynchronous kernel to the stream. The kernel arguments must be
2143
2132
// / placed in a special allocation for kernel args and must keep alive until
2144
2133
// / the kernel finalizes. Once the kernel is finished, the stream will release
@@ -2194,9 +2183,30 @@ struct AMDGPUStreamTy {
2194
2183
2195
2184
// Push the kernel with the output signal and an input signal (optional)
2196
2185
DP (" Using Queue: %p with HSA Queue: %p\n " , Queue, Queue->getHsaQueue ());
2197
- return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
2198
- GroupSize, StackSize, OutputSignal,
2199
- InputSignal);
2186
+ // If we are running an RPC server we want to wake up the server thread
2187
+ // whenever there is a kernel running and let it sleep otherwise.
2188
+ if (Device.getRPCServer ())
2189
+ Device.Plugin .getRPCServer ().Thread ->notify ();
2190
+
2191
+ // Push the kernel with the output signal and an input signal (optional)
2192
+ if (auto Err = Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads,
2193
+ NumBlocks, GroupSize, StackSize,
2194
+ OutputSignal, InputSignal))
2195
+ return Err;
2196
+
2197
+ // Register a callback to indicate when the kernel is complete.
2198
+ if (Device.getRPCServer ()) {
2199
+ if (auto Err = Slots[Curr].schedCallback (
2200
+ [](void *Data) -> llvm::Error {
2201
+ GenericPluginTy &Plugin =
2202
+ *reinterpret_cast <GenericPluginTy *>(Data);
2203
+ Plugin.getRPCServer ().Thread ->finish ();
2204
+ return Error::success ();
2205
+ },
2206
+ &Device.Plugin ))
2207
+ return Err;
2208
+ }
2209
+ return Plugin::success ();
2200
2210
}
2201
2211
2202
2212
// / Push an asynchronous memory copy between pinned memory buffers.
@@ -2268,9 +2278,8 @@ struct AMDGPUStreamTy {
2268
2278
2269
2279
// Wait for kernel to finish before scheduling the asynchronous copy.
2270
2280
if (UseSyncCopyBack && InputSignal && InputSignal->load ())
2271
- if (auto Err = InputSignal->wait (StreamBusyWaitMicroseconds, RPCServer, &Device))
2281
+ if (auto Err = InputSignal->wait (StreamBusyWaitMicroseconds, &Device))
2272
2282
return Err;
2273
-
2274
2283
#ifdef OMPT_SUPPORT
2275
2284
2276
2285
if (OmptInfo) {
@@ -2457,8 +2466,8 @@ struct AMDGPUStreamTy {
2457
2466
return Plugin::success ();
2458
2467
2459
2468
// Wait until all previous operations on the stream have completed.
2460
- if (auto Err = Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds,
2461
- RPCServer , &Device))
2469
+ if (auto Err =
2470
+ Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds , &Device))
2462
2471
return Err;
2463
2472
2464
2473
// Reset the stream and perform all pending post actions.
@@ -4701,7 +4710,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
4701
4710
: Agent(Device.getAgent()), Queue(nullptr ),
4702
4711
SignalManager (Device.getSignalManager()), Device(Device),
4703
4712
// Initialize the std::deque with some empty positions.
4704
- Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer( nullptr ),
4713
+ Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
4705
4714
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
4706
4715
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4707
4716
UseSyncCopyBack(Device.syncCopyBack()) {}
@@ -5117,10 +5126,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
5117
5126
DP (" No hostrpc buffer or service thread required\n " );
5118
5127
}
5119
5128
5120
- // If this kernel requires an RPC server we attach its pointer to the stream.
5121
- if (GenericDevice.getRPCServer ())
5122
- Stream->setRPCServer (GenericDevice.getRPCServer ());
5123
-
5124
5129
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
5125
5130
if (ImplArgs &&
5126
5131
getImplicitArgsSize () == sizeof (hsa_utils::AMDGPUImplicitArgsTy)) {
0 commit comments