Skip to content

Commit ef96e58

Browse files
committed
[libc] Rework the RPC interface to accept runtime wave sizes
Summary: The RPC interface needs to handle an entire warp or wavefront at once. This is currently done by using a compile-time constant indicating the size of the buffer, which right now defaults to some value on the client (GPU) side. However, there are currently attempts to move the `libc` library to a single IR build. This is problematic as the size of the wavefronts changes between ISAs on AMDGPU. The builtin `__builtin_amdgcn_wavefrontsize()` will return the appropriate value, but it is now only known at runtime. In order to support this, this patch restructures the packet. Now, instead of having an array of arrays, we simply have a large array of buffers and slice it according to the runtime value if we don't know it ahead of time. This also has the advantage of making the buffer contiguous within a page now that the header has been moved out of it.
1 parent 347ab99 commit ef96e58

File tree

29 files changed

+106
-100
lines changed

29 files changed

+106
-100
lines changed

libc/docs/gpu/rpc.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ done. It can be omitted if asynchronous execution is desired.
125125
.. code-block:: c++
126126

127127
void rpc_host_call(void *fn, void *data, size_t size) {
128-
rpc::Client::Port port = rpc::client.open<RPC_HOST_CALL>();
128+
rpc::Client<>::Port port = rpc::client.open<RPC_HOST_CALL>();
129129
port.send_n(data, size);
130130
port.send([=](rpc::Buffer *buffer) {
131131
buffer->data[0] = reinterpret_cast<uintptr_t>(fn);

libc/src/__support/GPU/amdgpu/utils.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,6 @@
1717
namespace LIBC_NAMESPACE {
1818
namespace gpu {
1919

20-
/// The number of threads that execute in lock-step in a lane.
21-
constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
22-
2320
/// Type aliases to the address spaces used by the AMDGPU backend.
2421
template <typename T> using Private = [[clang::opencl_private]] T;
2522
template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -108,8 +105,11 @@ LIBC_INLINE uint64_t get_thread_id() {
108105
get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
109106
}
110107

111-
/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.
112-
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
108+
/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
109+
/// and compilation options.
110+
LIBC_INLINE uint32_t get_lane_size() {
111+
return __builtin_amdgcn_wavefrontsize();
112+
}
113113

114114
/// Returns the id of the thread inside of an AMD wavefront executing together.
115115
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {

libc/src/__support/GPU/generic/utils.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
namespace LIBC_NAMESPACE {
1717
namespace gpu {
1818

19-
constexpr const uint64_t LANE_SIZE = 1;
20-
2119
template <typename T> using Private = T;
2220
template <typename T> using Constant = T;
2321
template <typename T> using Shared = T;
@@ -55,7 +53,7 @@ LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
5553

5654
LIBC_INLINE uint64_t get_thread_id() { return 0; }
5755

58-
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
56+
LIBC_INLINE uint32_t get_lane_size() { return 1; }
5957

6058
LIBC_INLINE uint32_t get_lane_id() { return 0; }
6159

libc/src/__support/GPU/nvptx/utils.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@
1616
namespace LIBC_NAMESPACE {
1717
namespace gpu {
1818

19-
/// The number of threads that execute in lock-step in a warp.
20-
constexpr const uint64_t LANE_SIZE = 32;
21-
2219
/// Type aliases to the address spaces used by the NVPTX backend.
2320
template <typename T> using Private = [[clang::opencl_private]] T;
2421
template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -95,8 +92,8 @@ LIBC_INLINE uint64_t get_thread_id() {
9592
get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
9693
}
9794

98-
/// Returns the size of a CUDA warp.
99-
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
95+
/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
96+
LIBC_INLINE uint32_t get_lane_size() { return 32; }
10097

10198
/// Returns the id of the thread inside of a CUDA warp executing together.
10299
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {

libc/src/__support/OSUtil/gpu/io.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
namespace LIBC_NAMESPACE {
1515

1616
void write_to_stderr(cpp::string_view msg) {
17-
rpc::Client::Port port = rpc::client.open<RPC_WRITE_TO_STDERR>();
17+
rpc::Client<>::Port port = rpc::client.open<RPC_WRITE_TO_STDERR>();
1818
port.send_n(msg.data(), msg.size());
1919
port.recv([](rpc::Buffer *) { /* void */ });
2020
port.close();

libc/src/__support/OSUtil/gpu/quick_exit.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace LIBC_NAMESPACE {
1818

1919
void quick_exit(int status) {
2020
// We want to first make sure the server is listening before we exit.
21-
rpc::Client::Port port = rpc::client.open<RPC_EXIT>();
21+
rpc::Client<>::Port port = rpc::client.open<RPC_EXIT>();
2222
port.send_and_recv([](rpc::Buffer *) {}, [](rpc::Buffer *) {});
2323
port.send([&](rpc::Buffer *buffer) {
2424
reinterpret_cast<uint32_t *>(buffer->data)[0] = status;

0 commit comments

Comments
 (0)