
Commit 6d1a513

[libc][Docs] Update libc documentation for RPC and others (#120018)
Summary: A few of these were out of date, update them now that the C library interface into RPC was deleted.
1 parent 9cb68b4

File tree

libc/docs/gpu/rpc.rst
libc/docs/gpu/using.rst

2 files changed: 33 additions & 114 deletions


libc/docs/gpu/rpc.rst

Lines changed: 32 additions & 80 deletions
@@ -92,20 +92,6 @@ asynchronous operations that do not need to wait until the server has completed
 them. If an operation requires more data than the fixed size buffer, we simply
 send multiple packets back and forth in a streaming fashion.
 
-Server Library
---------------
-
-The RPC server's basic functionality is provided by the LLVM C library. A static
-library called ``libllvmlibc_rpc_server.a`` includes handling for the basic
-operations, such as printing or exiting. This has a small API that handles
-setting up the unified buffer and an interface to check the opcodes.
-
-Some operations are too divergent to provide generic implementations for, such
-as allocating device accessible memory. For these cases, we provide a callback
-registration scheme to add a custom handler for any given opcode through the
-port API. More information can be found in the installed header
-``<install>/include/llvmlibc_rpc_server.h``.
-
 Client Example
 --------------
 
@@ -183,7 +169,7 @@ CUDA Server Example
 
 The following code shows an example of using the exported RPC interface along
 with the C library to manually configure a working server using the CUDA
-language. Other runtimes can use the presence of the ``__llvm_libc_rpc_client``
+language. Other runtimes can use the presence of the ``__llvm_rpc_client``
 in the GPU executable as an indicator for whether or not the server can be
 checked. These details should ideally be handled by the GPU language runtime,
 but the following example shows how it can be used by a standard user.
@@ -196,53 +182,16 @@ but the following example shows how it can be used by a standard user.
   #include <cstdlib>
   #include <cuda_runtime.h>
 
-  #include <llvmlibc_rpc_server.h>
+  #include <shared/rpc.h>
+  #include <shared/rpc_opcodes.h>
 
   [[noreturn]] void handle_error(cudaError_t err) {
     fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
     exit(EXIT_FAILURE);
   }
 
-  [[noreturn]] void handle_error(rpc_status_t err) {
-    fprintf(stderr, "RPC error: %d\n", err);
-    exit(EXIT_FAILURE);
-  }
-
-  // The handle to the RPC client provided by the C library.
-  extern "C" __device__ void *__llvm_libc_rpc_client;
-
-  __global__ void get_client_ptr(void **ptr) { *ptr = __llvm_libc_rpc_client; }
-
-  // Obtain the RPC client's handle from the device. The CUDA language cannot look
-  // up the symbol directly like the driver API, so we launch a kernel to read it.
-  void *get_rpc_client() {
-    void *rpc_client = nullptr;
-    void **rpc_client_d = nullptr;
-
-    if (cudaError_t err = cudaMalloc(&rpc_client_d, sizeof(void *)))
-      handle_error(err);
-    get_client_ptr<<<1, 1>>>(rpc_client_d);
-    if (cudaError_t err = cudaDeviceSynchronize())
-      handle_error(err);
-    if (cudaError_t err = cudaMemcpy(&rpc_client, rpc_client_d, sizeof(void *),
-                                     cudaMemcpyDeviceToHost))
-      handle_error(err);
-    return rpc_client;
-  }
-
-  // Routines to allocate mapped memory that both the host and the device can
-  // access asychonrously to communicate with each other.
-  void *alloc_host(size_t size, void *) {
-    void *sharable_ptr;
-    if (cudaError_t err = cudaMallocHost(&sharable_ptr, sizeof(void *)))
-      handle_error(err);
-    return sharable_ptr;
-  };
-
-  void free_host(void *ptr, void *) {
-    if (cudaError_t err = cudaFreeHost(ptr))
-      handle_error(err);
-  }
+  // Routes the library symbol into the CUDA runtime interface.
+  [[gnu::weak]] __device__ rpc::Client client asm("__llvm_rpc_client");
 
   // The device-side overload of the standard C function to call.
   extern "C" __device__ int puts(const char *);
@@ -251,18 +200,23 @@ but the following example shows how it can be used by a standard user.
   __global__ void hello() { puts("Hello world!"); }
 
   int main() {
-    // Initialize the RPC server to run on the given device.
-    rpc_device_t device;
-    if (rpc_status_t err =
-            rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
-                            /*warp_size=*/32, alloc_host, /*data=*/nullptr))
+    void *rpc_client = nullptr;
+    if (cudaError_t err = cudaGetSymbolAddress(&rpc_client, client))
+      handle_error(err);
+
+    // Initialize the RPC client and server interface.
+    uint32_t warp_size = 32;
+    void *rpc_buffer = nullptr;
+    if (cudaError_t err = cudaMallocHost(
+            &rpc_buffer,
+            rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
       handle_error(err);
+    rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+    rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
 
-    // Initialize the RPC client by copying the buffer to the device's handle.
-    void *rpc_client = get_rpc_client();
-    if (cudaError_t err =
-            cudaMemcpy(rpc_client, rpc_get_client_buffer(device),
-                       rpc_get_client_size(), cudaMemcpyHostToDevice))
+    // Initialize the client on the device so it can communicate with the server.
+    if (cudaError_t err = cudaMemcpy(rpc_client, &client, sizeof(rpc::Client),
+                                     cudaMemcpyHostToDevice))
       handle_error(err);
 
     cudaStream_t stream;
@@ -274,28 +228,25 @@ but the following example shows how it can be used by a standard user.
 
     // While the kernel is executing, check the RPC server for work to do.
     // Requires non-blocking CUDA kernels but avoids a separate thread.
-    while (cudaStreamQuery(stream) == cudaErrorNotReady)
-      if (rpc_status_t err = rpc_handle_server(device))
-        handle_error(err);
-
-    // Shut down the server running on the given device.
-    if (rpc_status_t err =
-            rpc_server_shutdown(device, free_host, /*data=*/nullptr))
-      handle_error(err);
-
-    return EXIT_SUCCESS;
+    do {
+      auto port = server.try_open(warp_size, /*index=*/0);
+      // From libllvmlibc_rpc_server.a in the installation.
+      if (port)
+        handle_libc_opcodes(*port, warp_size);
+    } while (cudaStreamQuery(stream) == cudaErrorNotReady);
   }
 
 The above code must be compiled in CUDA's relocatable device code mode and with
 the advanced offloading driver to link in the library. Currently this can be
 done with the following invocation. Using LTO avoids the overhead normally
-associated with relocatable device code linking.
+associated with relocatable device code linking. The C library for GPUs is
+linked in by forwarding the static library to the device-side link job.
 
 .. code-block:: sh
 
-  $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart -lcgpu-nvptx \
+  $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart \
      -I<install-path>include -L<install-path>/lib -lllvmlibc_rpc_server \
-     -O3 -foffload-lto -o hello
+     -Xoffload-linker -lc -O3 -foffload-lto -o hello
   $> ./hello
   Hello world!
 
@@ -304,4 +255,5 @@ Extensions
 
 The opcode is a 32-bit integer that must be unique to the requested operation.
 All opcodes used by ``libc`` internally have the character ``c`` in the most
-significant byte.
+significant byte. Any other opcode is available for use outside of the ``libc``
+implementation.
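
The ``Extensions`` note added above implies that out-of-tree users can service
their own opcodes in the same polling loop shown in the CUDA example. The
following sketch is not part of the commit: it assumes the surrounding
example's includes, assumes the server port from ``shared/rpc.h`` exposes
``get_opcode``, ``recv``, and ``close`` as in the in-tree interface, and
``MY_OPCODE`` is a made-up value chosen so its most significant byte is not the
reserved ``c``.

.. code-block:: c++

  // Hypothetical user-defined opcode (most significant byte is not 'c').
  constexpr uint32_t MY_OPCODE = 0x12345678;

  // Inside the polling loop, once server.try_open() has returned a port.
  if (port->get_opcode() == MY_OPCODE) {
    // Receive one packet from each active lane and print its first word.
    port->recv([](rpc::Buffer *buffer, uint32_t lane) {
      printf("lane %u sent %llu\n", lane,
             static_cast<unsigned long long>(buffer->data[0]));
    });
    port->close();
  } else {
    // Fall back to the handlers provided by libllvmlibc_rpc_server.a.
    handle_libc_opcodes(*port, warp_size);
  }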

libc/docs/gpu/using.rst

Lines changed: 1 addition & 34 deletions
@@ -99,39 +99,6 @@ threads and two blocks.
 Including the wrapper headers, linking the C library, and running the :ref:`RPC
 server<libc_gpu_rpc>` are all handled automatically by the compiler and runtime.
 
-Binary format
-^^^^^^^^^^^^^
-
-The ``libcgpu.a`` static archive is a fat-binary containing LLVM-IR for each
-supported target device. The supported architectures can be seen using LLVM's
-``llvm-objdump`` with the ``--offloading`` flag:
-
-.. code-block:: sh
-
-  $> llvm-objdump --offloading libcgpu-amdgpu.a
-  libcgpu-amdgpu.a(strcmp.cpp.o):   file format elf64-x86-64
-
-  OFFLOADING IMAGE [0]:
-  kind            llvm ir
-  arch            generic
-  triple          amdgcn-amd-amdhsa
-  producer        none
-  ...
-
-Because the device code is stored inside a fat binary, it can be difficult to
-inspect the resulting code. This can be done using the following utilities:
-
-.. code-block:: sh
-
-  $> llvm-ar x libcgpu.a strcmp.cpp.o
-  $> clang-offload-packager strcmp.cpp.o --image=arch=generic,file=strcmp.bc
-  $> opt -S out.bc
-  ...
-
-Please note that this fat binary format is provided for compatibility with
-existing offloading toolchains. The implementation in ``libc`` does not depend
-on any existing offloading languages and is completely freestanding.
-
 Direct compilation
 ------------------
 

@@ -246,7 +213,7 @@ compilation. Using link time optimization will help hide this.
 
 .. code-block:: sh
 
-  $> clang hello.c --target=nvptx64-nvidia-cuda -mcpu=native -flto -lc <install>/lib/nvptx64-nvidia-cuda/crt1.o
+  $> clang hello.c --target=nvptx64-nvidia-cuda -march=native -flto -lc <install>/lib/nvptx64-nvidia-cuda/crt1.o
   $> nvptx-loader --threads 2 --blocks 2 a.out
   Hello from NVPTX!
   Hello from NVPTX!
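
The ``hello.c`` referenced by this invocation is not shown in the diff. A
minimal version (an assumption for illustration, not taken from the commit) is
just a standard C program, since the GPU ``libc`` forwards ``printf`` to the
host over RPC:

.. code-block:: c

  #include <stdio.h>

  int main() {
    printf("Hello from NVPTX!\n");
    return 0;
  }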
