[libc] Check the RPC server once again after the kernel exits

jhuber6 · jhuber6 · commit 182e5acb1172 · 2023-05-12T12:49:19.000-05:00
We support asynchronous sends, which means that the kernel can issue a send and then exit immediately, as we do with the `EXIT` syscall. Because the wait loop only services the server while the kernel is still running, it is possible for the kernel to exit and for us to break out of the loop before we check the server again. This can potentially cause us to ignore an `EXIT` call from the GPU. Reviewed By: JonChesterfield, lntue Differential Revision: https://reviews.llvm.org/D150456
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -221,6 +221,10 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
              /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
     handle_server();
 
+  // Handle the server one more time in case the kernel exited with a pending
+  // send still in flight.
+  handle_server();
+
   // Destroy the resources acquired to launch the kernel and return.
   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
     handle_error(err);
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -186,6 +186,10 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
     handle_server();
 
+  // Handle the server one more time in case the kernel exited with a pending
+  // send still in flight.
+  handle_server();
+
   return CUDA_SUCCESS;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -186,6 +186,10 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,`
`186`	`186`	`while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)`
`187`	`187`	`handle_server();`
`188`	`188`
	`189`	`+ // Handle the server one more time in case the kernel exited with a pending`
	`190`	`+ // send still in flight.`
	`191`	`+ handle_server();`
	`192`	`+`
`189`	`193`	`return CUDA_SUCCESS;`
`190`	`194`	`}`
`191`	`195`