@@ -92,20 +92,6 @@ asynchronous operations that do not need to wait until the server has completed
them. If an operation requires more data than the fixed size buffer, we simply
send multiple packets back and forth in a streaming fashion.

- Server Library
- --------------
-
- The RPC server's basic functionality is provided by the LLVM C library. A static
- library called ``libllvmlibc_rpc_server.a`` includes handling for the basic
- operations, such as printing or exiting. This has a small API that handles
- setting up the unified buffer and an interface to check the opcodes.
-
- Some operations are too divergent to provide generic implementations for, such
- as allocating device accessible memory. For these cases, we provide a callback
- registration scheme to add a custom handler for any given opcode through the
- port API. More information can be found in the installed header
- ``<install>/include/llvmlibc_rpc_server.h``.
-

Client Example
--------------
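To make the streaming model above concrete, here is a rough device-side sketch
(not part of this patch) of pushing a payload larger than a single packet
through one port. It assumes the ``send_n`` helper and the
``LIBC_WRITE_TO_STDOUT`` opcode from the installed ``shared/rpc.h`` and
``shared/rpc_opcodes.h`` headers; ``send_n`` splits the transfer into
fixed-size packets internally.

.. code-block:: c++

  #include <cstdint>

  #include <shared/rpc.h>
  #include <shared/rpc_opcodes.h>

  // The client handle exported by the GPU C library.
  [[gnu::weak]] __device__ rpc::Client client asm("__llvm_rpc_client");

  // Streams 'size' bytes to the host. If 'size' exceeds the fixed packet
  // buffer, send_n transparently sends multiple packets over the same port.
  __device__ void stream_to_host(const void *data, uint64_t size) {
    auto port = client.open<LIBC_WRITE_TO_STDOUT>();
    port.send_n(data, size);
    port.close();
  }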
@@ -183,7 +169,7 @@ CUDA Server Example

The following code shows an example of using the exported RPC interface along
with the C library to manually configure a working server using the CUDA
- language. Other runtimes can use the presence of the ``__llvm_libc_rpc_client``
+ language. Other runtimes can use the presence of the ``__llvm_rpc_client``
in the GPU executable as an indicator for whether or not the server can be
checked. These details should ideally be handled by the GPU language runtime,
but the following example shows how it can be used by a standard user.
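As a sketch of that detection strategy (assuming the CUDA driver API rather
than any interface from this patch), a runtime could probe a loaded module for
the symbol and only service RPC when it is present:

.. code-block:: c++

  #include <cuda.h>

  // Returns true if the loaded image references the GPU C library's RPC
  // client and therefore needs a server. cuModuleGetGlobal fails with
  // CUDA_ERROR_NOT_FOUND when the symbol is absent.
  bool needs_rpc_server(CUmodule module) {
    CUdeviceptr client_ptr;
    size_t client_size;
    return cuModuleGetGlobal(&client_ptr, &client_size, module,
                             "__llvm_rpc_client") == CUDA_SUCCESS;
  }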
@@ -196,53 +182,16 @@ but the following example shows how it can be used by a standard user.

  #include <cstdlib>
  #include <cuda_runtime.h>

- #include <llvmlibc_rpc_server.h>
+ #include <shared/rpc.h>
+ #include <shared/rpc_opcodes.h>

  [[noreturn]] void handle_error(cudaError_t err) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

- [[noreturn]] void handle_error(rpc_status_t err) {
-   fprintf(stderr, "RPC error: %d\n", err);
-   exit(EXIT_FAILURE);
- }
-
- // The handle to the RPC client provided by the C library.
- extern "C" __device__ void *__llvm_libc_rpc_client;
-
- __global__ void get_client_ptr(void **ptr) { *ptr = __llvm_libc_rpc_client; }
-
- // Obtain the RPC client's handle from the device. The CUDA language cannot look
- // up the symbol directly like the driver API, so we launch a kernel to read it.
- void *get_rpc_client() {
-   void *rpc_client = nullptr;
-   void **rpc_client_d = nullptr;
-
-   if (cudaError_t err = cudaMalloc(&rpc_client_d, sizeof(void *)))
-     handle_error(err);
-   get_client_ptr<<<1, 1>>>(rpc_client_d);
-   if (cudaError_t err = cudaDeviceSynchronize())
-     handle_error(err);
-   if (cudaError_t err = cudaMemcpy(&rpc_client, rpc_client_d, sizeof(void *),
-                                    cudaMemcpyDeviceToHost))
-     handle_error(err);
-   return rpc_client;
- }
-
- // Routines to allocate mapped memory that both the host and the device can
- // access asynchronously to communicate with each other.
- void *alloc_host(size_t size, void *) {
-   void *sharable_ptr;
-   if (cudaError_t err = cudaMallocHost(&sharable_ptr, size))
-     handle_error(err);
-   return sharable_ptr;
- }
-
- void free_host(void *ptr, void *) {
-   if (cudaError_t err = cudaFreeHost(ptr))
-     handle_error(err);
- }
+ // Routes the library symbol into the CUDA runtime interface.
+ [[gnu::weak]] __device__ rpc::Client client asm("__llvm_rpc_client");

  // The device-side overload of the standard C function to call.
  extern "C" __device__ int puts(const char *);
@@ -251,18 +200,23 @@ but the following example shows how it can be used by a standard user.

  __global__ void hello() { puts("Hello world!"); }

  int main() {
-   // Initialize the RPC server to run on the given device.
-   rpc_device_t device;
-   if (rpc_status_t err =
-           rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
-                           /*warp_size=*/32, alloc_host, /*data=*/nullptr))
+   void *rpc_client = nullptr;
+   if (cudaError_t err = cudaGetSymbolAddress(&rpc_client, client))
+     handle_error(err);
+
+   // Initialize the RPC client and server interface.
+   uint32_t warp_size = 32;
+   void *rpc_buffer = nullptr;
+   if (cudaError_t err = cudaMallocHost(
+           &rpc_buffer,
+           rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
      handle_error(err);
+   rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+   rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

-   // Initialize the RPC client by copying the buffer to the device's handle.
-   void *rpc_client = get_rpc_client();
-   if (cudaError_t err =
-           cudaMemcpy(rpc_client, rpc_get_client_buffer(device),
-                      rpc_get_client_size(), cudaMemcpyHostToDevice))
+   // Initialize the client on the device so it can communicate with the server.
+   if (cudaError_t err = cudaMemcpy(rpc_client, &client, sizeof(rpc::Client),
+                                    cudaMemcpyHostToDevice))
      handle_error(err);

    cudaStream_t stream;
@@ -274,28 +228,25 @@ but the following example shows how it can be used by a standard user.

    // While the kernel is executing, check the RPC server for work to do.
    // Requires non-blocking CUDA kernels but avoids a separate thread.
-   while (cudaStreamQuery(stream) == cudaErrorNotReady)
-     if (rpc_status_t err = rpc_handle_server(device))
-       handle_error(err);
-
-   // Shut down the server running on the given device.
-   if (rpc_status_t err =
-           rpc_server_shutdown(device, free_host, /*data=*/nullptr))
-     handle_error(err);
-
-   return EXIT_SUCCESS;
+   do {
+     auto port = server.try_open(warp_size, /*index=*/0);
+     // From libllvmlibc_rpc_server.a in the installation.
+     if (port)
+       handle_libc_opcodes(*port, warp_size);
+   } while (cudaStreamQuery(stream) == cudaErrorNotReady);
  }
The above code must be compiled in CUDA's relocatable device code mode and with
the advanced offloading driver to link in the library. Currently this can be
done with the following invocation. Using LTO avoids the overhead normally
- associated with relocatable device code linking.
+ associated with relocatable device code linking. The C library for GPUs is
+ linked in by forwarding the static library to the device-side link job.

.. code-block:: sh

-   $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart -lcgpu-nvptx \
+   $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart \
       -I<install-path>/include -L<install-path>/lib -lllvmlibc_rpc_server \
-      -O3 -foffload-lto -o hello
+      -Xoffload-linker -lc -O3 -foffload-lto -o hello
  $> ./hello
  Hello world!
@@ -304,4 +255,5 @@ Extensions

The opcode is a 32-bit integer that must be unique to the requested operation.
All opcodes used by ``libc`` internally have the character ``c`` in the most
- significant byte.
+ significant byte. Any other opcode is available for use outside of the ``libc``
+ implementation.
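As an illustration, a user extension could claim any opcode whose most
significant byte is not ``c`` (0x63) and handle it in its own server loop
before deferring to ``handle_libc_opcodes``; the value below is hypothetical:

.. code-block:: c++

  #include <cstdint>

  // A hypothetical user-defined opcode. The most significant byte is 'u'
  // (0x75), so it can never collide with the opcodes libc reserves, which
  // all carry 'c' (0x63) in the most significant byte.
  enum : uint32_t {
    USER_HELLO_OPCODE = (uint32_t('u') << 24) | 0x0001,
  };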