-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[libc][Docs] Update libc documentation for RPC and others
#120018
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Summary: A few of these were out of date, update them now that the C library interface into RPC was deleted.
@llvm/pr-subscribers-libc Author: Joseph Huber (jhuber6) — Changes. Summary: A few of these were out of date; update them now that the C library interface into RPC was deleted. Full diff: https://github.com/llvm/llvm-project/pull/120018.diff 2 Files Affected:
diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst
index e1244154341e9f..0d169c7db9a50f 100644
--- a/libc/docs/gpu/rpc.rst
+++ b/libc/docs/gpu/rpc.rst
@@ -92,20 +92,6 @@ asynchronous operations that do not need to wait until the server has completed
them. If an operation requires more data than the fixed size buffer, we simply
send multiple packets back and forth in a streaming fashion.
-Server Library
---------------
-
-The RPC server's basic functionality is provided by the LLVM C library. A static
-library called ``libllvmlibc_rpc_server.a`` includes handling for the basic
-operations, such as printing or exiting. This has a small API that handles
-setting up the unified buffer and an interface to check the opcodes.
-
-Some operations are too divergent to provide generic implementations for, such
-as allocating device accessible memory. For these cases, we provide a callback
-registration scheme to add a custom handler for any given opcode through the
-port API. More information can be found in the installed header
-``<install>/include/llvmlibc_rpc_server.h``.
-
Client Example
--------------
@@ -183,7 +169,7 @@ CUDA Server Example
The following code shows an example of using the exported RPC interface along
with the C library to manually configure a working server using the CUDA
-language. Other runtimes can use the presence of the ``__llvm_libc_rpc_client``
+language. Other runtimes can use the presence of the ``__llvm_rpc_client``
in the GPU executable as an indicator for whether or not the server can be
checked. These details should ideally be handled by the GPU language runtime,
but the following example shows how it can be used by a standard user.
@@ -196,53 +182,16 @@ but the following example shows how it can be used by a standard user.
#include <cstdlib>
#include <cuda_runtime.h>
- #include <llvmlibc_rpc_server.h>
+ #include <shared/rpc.h>
+ #include <shared/rpc_opcodes.h>
[[noreturn]] void handle_error(cudaError_t err) {
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
- [[noreturn]] void handle_error(rpc_status_t err) {
- fprintf(stderr, "RPC error: %d\n", err);
- exit(EXIT_FAILURE);
- }
-
- // The handle to the RPC client provided by the C library.
- extern "C" __device__ void *__llvm_libc_rpc_client;
-
- __global__ void get_client_ptr(void **ptr) { *ptr = __llvm_libc_rpc_client; }
-
- // Obtain the RPC client's handle from the device. The CUDA language cannot look
- // up the symbol directly like the driver API, so we launch a kernel to read it.
- void *get_rpc_client() {
- void *rpc_client = nullptr;
- void **rpc_client_d = nullptr;
-
- if (cudaError_t err = cudaMalloc(&rpc_client_d, sizeof(void *)))
- handle_error(err);
- get_client_ptr<<<1, 1>>>(rpc_client_d);
- if (cudaError_t err = cudaDeviceSynchronize())
- handle_error(err);
- if (cudaError_t err = cudaMemcpy(&rpc_client, rpc_client_d, sizeof(void *),
- cudaMemcpyDeviceToHost))
- handle_error(err);
- return rpc_client;
- }
-
- // Routines to allocate mapped memory that both the host and the device can
- // access asychonrously to communicate with each other.
- void *alloc_host(size_t size, void *) {
- void *sharable_ptr;
- if (cudaError_t err = cudaMallocHost(&sharable_ptr, sizeof(void *)))
- handle_error(err);
- return sharable_ptr;
- };
-
- void free_host(void *ptr, void *) {
- if (cudaError_t err = cudaFreeHost(ptr))
- handle_error(err);
- }
+ // Routes the library symbol into the CUDA runtime interface.
+ [[gnu::weak]] __device__ rpc::Client client asm("__llvm_rpc_client");
// The device-side overload of the standard C function to call.
extern "C" __device__ int puts(const char *);
@@ -251,18 +200,23 @@ but the following example shows how it can be used by a standard user.
__global__ void hello() { puts("Hello world!"); }
int main() {
- // Initialize the RPC server to run on the given device.
- rpc_device_t device;
- if (rpc_status_t err =
- rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
- /*warp_size=*/32, alloc_host, /*data=*/nullptr))
+ void *rpc_client = nullptr;
+ if (cudaError_t err = cudaGetSymbolAddress(&rpc_client, client))
+ handle_error(err);
+
+ // Initialize the RPC client and server interface.
+ uint32_t warp_size = 32;
+ void *rpc_buffer = nullptr;
+ if (cudaError_t err = cudaMallocHost(
+ &rpc_buffer,
+ rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
handle_error(err);
+ rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+ rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
- // Initialize the RPC client by copying the buffer to the device's handle.
- void *rpc_client = get_rpc_client();
- if (cudaError_t err =
- cudaMemcpy(rpc_client, rpc_get_client_buffer(device),
- rpc_get_client_size(), cudaMemcpyHostToDevice))
+ // Initialize the client on the device so it can communicate with the server.
+ if (cudaError_t err = cudaMemcpy(rpc_client, &client, sizeof(rpc::Client),
+ cudaMemcpyHostToDevice))
handle_error(err);
cudaStream_t stream;
@@ -274,28 +228,25 @@ but the following example shows how it can be used by a standard user.
// While the kernel is executing, check the RPC server for work to do.
// Requires non-blocking CUDA kernels but avoids a separate thread.
- while (cudaStreamQuery(stream) == cudaErrorNotReady)
- if (rpc_status_t err = rpc_handle_server(device))
- handle_error(err);
-
- // Shut down the server running on the given device.
- if (rpc_status_t err =
- rpc_server_shutdown(device, free_host, /*data=*/nullptr))
- handle_error(err);
-
- return EXIT_SUCCESS;
+ do {
+ auto port = server.try_open(warp_size, /*index=*/0);
+ // From libllvmlibc_rpc_server.a in the installation.
+ if (port)
+ handle_libc_opcodes(*port, warp_size);
+ } while (cudaStreamQuery(stream) == cudaErrorNotReady);
}
The above code must be compiled in CUDA's relocatable device code mode and with
the advanced offloading driver to link in the library. Currently this can be
done with the following invocation. Using LTO avoids the overhead normally
-associated with relocatable device code linking.
+associated with relocatable device code linking. The C library for GPUs is
+linked in by forwarding the static library to the device-side link job.
.. code-block:: sh
- $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart -lcgpu-nvptx \
+ $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart \
-I<install-path>include -L<install-path>/lib -lllvmlibc_rpc_server \
- -O3 -foffload-lto -o hello
+ -Xoffload-linker -lc -O3 -foffload-lto -o hello
$> ./hello
Hello world!
@@ -304,4 +255,5 @@ Extensions
The opcode is a 32-bit integer that must be unique to the requested operation.
All opcodes used by ``libc`` internally have the character ``c`` in the most
-significant byte.
+significant byte. Any other opcode is available for use outside of the ``libc``
+implementation.
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
index e56b6f634bb31e..1c1f9c9bfb0c69 100644
--- a/libc/docs/gpu/using.rst
+++ b/libc/docs/gpu/using.rst
@@ -99,39 +99,6 @@ threads and two blocks.
Including the wrapper headers, linking the C library, and running the :ref:`RPC
server<libc_gpu_rpc>` are all handled automatically by the compiler and runtime.
-Binary format
-^^^^^^^^^^^^^
-
-The ``libcgpu.a`` static archive is a fat-binary containing LLVM-IR for each
-supported target device. The supported architectures can be seen using LLVM's
-``llvm-objdump`` with the ``--offloading`` flag:
-
-.. code-block:: sh
-
- $> llvm-objdump --offloading libcgpu-amdgpu.a
- libcgpu-amdgpu.a(strcmp.cpp.o): file format elf64-x86-64
-
- OFFLOADING IMAGE [0]:
- kind llvm ir
- arch generic
- triple amdgcn-amd-amdhsa
- producer none
- ...
-
-Because the device code is stored inside a fat binary, it can be difficult to
-inspect the resulting code. This can be done using the following utilities:
-
-.. code-block:: sh
-
- $> llvm-ar x libcgpu.a strcmp.cpp.o
- $> clang-offload-packager strcmp.cpp.o --image=arch=generic,file=strcmp.bc
- $> opt -S out.bc
- ...
-
-Please note that this fat binary format is provided for compatibility with
-existing offloading toolchains. The implementation in ``libc`` does not depend
-on any existing offloading languages and is completely freestanding.
-
Direct compilation
------------------
@@ -246,7 +213,7 @@ compilation. Using link time optimization will help hide this.
.. code-block:: sh
- $> clang hello.c --target=nvptx64-nvidia-cuda -mcpu=native -flto -lc <install>/lib/nvptx64-nvidia-cuda/crt1.o
+ $> clang hello.c --target=nvptx64-nvidia-cuda -march=native -flto -lc <install>/lib/nvptx64-nvidia-cuda/crt1.o
$> nvptx-loader --threads 2 --blocks 2 a.out
Hello from NVPTX!
Hello from NVPTX!
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice to see the examples looking much simpler after the recent changes, thanks!
Summary:
A few of these were out of date, update them now that the C library
interface into RPC was deleted.