
Commit 222f6af

[libc] Remove more libc dependencies from the RPC header (#116437)
Summary: The end goal is to make `rpc.h` a standalone header so that other projects can include it without leaking `libc` internals. I'm replacing the dependencies incrementally, ahead of pulling the header out all at once, to keep each change small. This patch removes the dependency on the `libc` atomics along with a few other sparse includes; what remains is mostly the GPU utils, the sleep function, optional, and the type traits, which I'll clean up in future patches. It also drops the old workaround I had around the memcpy. That workaround is no longer as necessary as it once was: removing it eliminates a branch and costs only a few extra VGPRs, since I believe the builtin memcpy was improved for AMD.
1 parent 748a29f commit 222f6af
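
The heart of the change: where the header previously went through `cpp::Atomic<uint32_t>`, it now uses plain `uint32_t` objects and Clang's `__scoped_atomic_*` builtins, falling back to the unscoped `__atomic_*` forms when the scoped variants are unavailable. A minimal self-contained sketch of the pattern (names like `mailbox` are illustrative, not from the patch):

```c++
#include <stdint.h>

// Fallback: if the compiler lacks the scoped builtins, drop the scope
// argument and use the ordinary atomic builtins instead.
#if !__has_builtin(__scoped_atomic_load_n)
#define __scoped_atomic_load_n(src, ord, scp) __atomic_load_n(src, ord)
#define __scoped_atomic_store_n(dst, src, ord, scp)                           \
  __atomic_store_n(dst, src, ord)
#endif

// A plain integer in memory shared between the client and server processes.
uint32_t load_mailbox(const uint32_t *mailbox) {
  return __scoped_atomic_load_n(mailbox, __ATOMIC_RELAXED,
                                __MEMORY_SCOPE_SYSTEM);
}

void store_mailbox(uint32_t *mailbox, uint32_t value) {
  __scoped_atomic_store_n(mailbox, value, __ATOMIC_RELAXED,
                          __MEMORY_SCOPE_SYSTEM);
}
```

This keeps the same relaxed, system-scope semantics on targets that support scoped atomics while removing the `cpp::Atomic` type from the header's interface.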

File tree

libc/src/__support/RPC/rpc.h
libc/src/__support/RPC/rpc_util.h

2 files changed: +56 −50 lines changed

libc/src/__support/RPC/rpc.h

Lines changed: 50 additions & 37 deletions
@@ -19,8 +19,6 @@
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
 
 #include "rpc_util.h"
-#include "src/__support/CPP/algorithm.h" // max
-#include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/macros/config.h"
@@ -30,6 +28,17 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
+/// Use scoped atomic variants if they are available for the target.
+#if !__has_builtin(__scoped_atomic_load_n)
+#define __scoped_atomic_load_n(src, ord, scp) __atomic_load_n(src, ord)
+#define __scoped_atomic_store_n(dst, src, ord, scp)                           \
+  __atomic_store_n(dst, src, ord)
+#define __scoped_atomic_fetch_or(src, val, ord, scp)                          \
+  __atomic_fetch_or(src, val, ord)
+#define __scoped_atomic_fetch_and(src, val, ord, scp)                         \
+  __atomic_fetch_and(src, val, ord)
+#endif
+
 /// A fixed size channel used to communicate between the RPC client and server.
 struct Buffer {
   uint64_t data[8];
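
Note a preprocessor subtlety that makes the fallback safe: the `scp` parameter is simply dropped during macro expansion, so call sites may pass scope constants like `__MEMORY_SCOPE_SYSTEM` or `__MEMORY_SCOPE_DEVICE` even on compilers that never define them; the token disappears before semantic analysis ever sees it.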
@@ -67,18 +76,18 @@ template <bool Invert> struct Process {
   LIBC_INLINE ~Process() = default;
 
   uint32_t port_count = 0;
-  cpp::Atomic<uint32_t> *inbox = nullptr;
-  cpp::Atomic<uint32_t> *outbox = nullptr;
+  uint32_t *inbox = nullptr;
+  uint32_t *outbox = nullptr;
   Header *header = nullptr;
   Buffer *packet = nullptr;
 
   static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
-  cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
+  uint32_t lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
 
   LIBC_INLINE Process(uint32_t port_count, void *buffer)
-      : port_count(port_count), inbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+      : port_count(port_count), inbox(reinterpret_cast<uint32_t *>(
                                     advance(buffer, inbox_offset(port_count)))),
-        outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+        outbox(reinterpret_cast<uint32_t *>(
             advance(buffer, outbox_offset(port_count)))),
         header(reinterpret_cast<Header *>(
             advance(buffer, header_offset(port_count)))),
@@ -102,15 +111,15 @@ template <bool Invert> struct Process {
   /// Retrieve the inbox state from memory shared between processes.
   LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
     return gpu::broadcast_value(
-        lane_mask,
-        inbox[index].load(cpp::MemoryOrder::RELAXED, cpp::MemoryScope::SYSTEM));
+        lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED,
+                                          __MEMORY_SCOPE_SYSTEM));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
   LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
-    return gpu::broadcast_value(lane_mask,
-                                outbox[index].load(cpp::MemoryOrder::RELAXED,
-                                                   cpp::MemoryScope::SYSTEM));
+    return gpu::broadcast_value(
+        lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED,
+                                          __MEMORY_SCOPE_SYSTEM));
   }
 
   /// Signal to the other process that this one is finished with the buffer.
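
Both loaders share one shape: a single relaxed, system-scope load of the mailbox word, followed by a warp-wide broadcast so every active lane agrees on the snapshot. A sketch of that shape (`broadcast_value` is declared here as a hypothetical stand-in for `gpu::broadcast_value`, which on real targets lowers to a shuffle/readfirstlane-style intrinsic):

```c++
#include <stdint.h>

// Hypothetical stand-in for gpu::broadcast_value: hands the first active
// lane's value to every lane named in the mask.
uint32_t broadcast_value(uint64_t lane_mask, uint32_t value);

// Shared shape of load_inbox/load_outbox: one relaxed system-scope load,
// then a broadcast so divergent lanes cannot disagree about the mailbox.
uint32_t load_mailbox(uint64_t lane_mask, const uint32_t *box,
                      uint32_t index) {
  return broadcast_value(lane_mask,
                         __scoped_atomic_load_n(&box[index], __ATOMIC_RELAXED,
                                                __MEMORY_SCOPE_SYSTEM));
}
```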
@@ -119,9 +128,9 @@ template <bool Invert> struct Process {
   /// cheaper than calling load_outbox to get the value to store.
   LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
     uint32_t inverted_outbox = !current_outbox;
-    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-    outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED,
-                        cpp::MemoryScope::SYSTEM);
+    __atomic_thread_fence(__ATOMIC_RELEASE);
+    __scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED,
+                            __MEMORY_SCOPE_SYSTEM);
     return inverted_outbox;
   }
 
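The fence-then-relaxed-store pair in `invert_outbox` is the classic publication idiom: the release fence orders every prior write to the shared packet before the flag flip, so the flip itself only needs a relaxed store. The same idiom in isolation (illustrative names):

```c++
#include <stdint.h>

// Publish work to the other process by flipping the outbox bit. The
// release fence makes all earlier writes to the shared buffer visible
// before the flag changes; the store itself can then be relaxed.
uint32_t publish(uint32_t *outbox, uint32_t index, uint32_t current) {
  uint32_t inverted = !current;
  __atomic_thread_fence(__ATOMIC_RELEASE);
  __scoped_atomic_store_n(&outbox[index], inverted, __ATOMIC_RELAXED,
                          __MEMORY_SCOPE_SYSTEM);
  return inverted;
}
```

The matching acquire fence after the inbox wait, in the next hunk, completes the release/acquire pairing across the two processes.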

@@ -133,7 +142,7 @@ template <bool Invert> struct Process {
       sleep_briefly();
       in = load_inbox(lane_mask, index);
     }
-    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    __atomic_thread_fence(__ATOMIC_ACQUIRE);
   }
 
   /// The packet is a linearly allocated array of buffers used to communicate
@@ -155,8 +164,7 @@ template <bool Invert> struct Process {
   /// lane_mask is a bitmap of the threads in the warp that would hold the
   /// single lock on success, e.g. the result of gpu::get_lane_mask()
   /// The lock is held when the n-th bit of the lock bitfield is set.
-  [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
-                                                  uint32_t index) {
+  LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
     // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
     // On volta, need to handle differences between the threads running and
     // the threads that were detected in the previous call to get_lane_mask()
@@ -190,16 +198,15 @@ template <bool Invert> struct Process {
     // inlining the current function.
     bool holding_lock = lane_mask != packed;
     if (holding_lock)
-      atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+      __atomic_thread_fence(__ATOMIC_ACQUIRE);
     return holding_lock;
   }
 
   /// Unlock the lock at index. We need a lane sync to keep this function
   /// convergent, otherwise the compiler will sink the store and deadlock.
-  [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
-                                                uint32_t index) {
+  LIBC_INLINE void unlock(uint64_t lane_mask, uint32_t index) {
     // Do not move any writes past the unlock.
-    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    __atomic_thread_fence(__ATOMIC_RELEASE);
 
     // Use exactly one thread to clear the nth bit in the lock array Must
     // restrict to a single thread to avoid one thread dropping the lock, then
@@ -211,7 +218,7 @@ template <bool Invert> struct Process {
 
   /// Number of bytes to allocate for an inbox or outbox.
   LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
-    return port_count * sizeof(cpp::Atomic<uint32_t>);
+    return port_count * sizeof(uint32_t);
   }
 
   /// Number of bytes to allocate for the buffer containing the packets.
@@ -242,24 +249,24 @@ template <bool Invert> struct Process {
   }
 
   /// Conditionally set the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
-                                                uint32_t index, bool cond) {
+  LIBC_INLINE static constexpr uint32_t set_nth(uint32_t *bits, uint32_t index,
+                                                bool cond) {
     uint32_t slot = index / NUM_BITS_IN_WORD;
     uint32_t bit = index % NUM_BITS_IN_WORD;
-    return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
-                               cpp::MemoryOrder::RELAXED,
-                               cpp::MemoryScope::DEVICE) &
+    return __scoped_atomic_fetch_or(&bits[slot],
+                                    static_cast<uint32_t>(cond) << bit,
+                                    __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE) &
            (1u << bit);
   }
 
   /// Conditionally clear the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
+  LIBC_INLINE static constexpr uint32_t clear_nth(uint32_t *bits,
                                                   uint32_t index, bool cond) {
     uint32_t slot = index / NUM_BITS_IN_WORD;
     uint32_t bit = index % NUM_BITS_IN_WORD;
-    return bits[slot].fetch_and(~0u ^ (static_cast<uint32_t>(cond) << bit),
-                                cpp::MemoryOrder::RELAXED,
-                                cpp::MemoryScope::DEVICE) &
+    return __scoped_atomic_fetch_and(&bits[slot],
+                                     ~0u ^ (static_cast<uint32_t>(cond) << bit),
+                                     __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE) &
            (1u << bit);
   }
 };
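
The conditional bit helpers are worth a second look: when `cond` is false, `set_nth` ORs in zero and `clear_nth` ANDs with all-ones, so both degenerate into a pure read while still returning the bit's previous state. A standalone version of the same logic, using the unscoped builtins so it compiles on any recent GCC or Clang:

```c++
#include <stdint.h>

constexpr uint32_t BITS_PER_WORD = sizeof(uint32_t) * 8;

// Conditionally set bit `index`; returns nonzero iff it was already set.
// With cond == false the fetch_or ORs in zero: a pure read of the bit.
uint32_t test_and_set_nth(uint32_t *bits, uint32_t index, bool cond) {
  uint32_t slot = index / BITS_PER_WORD;
  uint32_t bit = index % BITS_PER_WORD;
  return __atomic_fetch_or(&bits[slot], static_cast<uint32_t>(cond) << bit,
                           __ATOMIC_RELAXED) &
         (1u << bit);
}

// Conditionally clear bit `index`; returns nonzero iff it was set before.
// With cond == false the fetch_and ANDs with ~0u: again a pure read.
uint32_t test_and_clear_nth(uint32_t *bits, uint32_t index, bool cond) {
  uint32_t slot = index / BITS_PER_WORD;
  uint32_t bit = index % BITS_PER_WORD;
  return __atomic_fetch_and(&bits[slot],
                            ~0u ^ (static_cast<uint32_t>(cond) << bit),
                            __ATOMIC_RELAXED) &
         (1u << bit);
}
```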
@@ -450,7 +457,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
   send([&](Buffer *buffer, uint32_t id) {
     reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
     num_sends = is_process_gpu() ? lane_value(size, id)
-                                 : cpp::max(lane_value(size, id), num_sends);
+                                 : rpc::max(lane_value(size, id), num_sends);
     uint64_t len =
         lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
             ? sizeof(Buffer::data) - sizeof(uint64_t)
@@ -483,7 +490,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
     lane_value(dst, id) =
         reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));
     num_recvs = is_process_gpu() ? lane_value(size, id)
-                                 : cpp::max(lane_value(size, id), num_recvs);
+                                 : rpc::max(lane_value(size, id), num_recvs);
     uint64_t len =
         lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
             ? sizeof(Buffer::data) - sizeof(uint64_t)
@@ -510,8 +517,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 /// port. Each port instance uses an associated \p opcode to tell the server
 /// what to do. The Client interface provides the appropriate lane size to the
 /// port using the platform's returned value.
-template <uint16_t opcode>
-[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
+template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
   for (uint32_t index = gpu::get_cluster_id();; ++index) {
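
`Client::open` keeps its retry structure: scan port indices starting from the cluster id and loop until a port is successfully claimed. A schematic of that loop (`try_open_port` is a hypothetical stand-in for the lock-plus-handshake attempt in the real code, and the modulo wraparound is an assumption of the sketch, not a quote of the patch):

```c++
#include <stdint.h>
#include <optional>

// Hypothetical stand-in for one open attempt: take the port lock and
// complete the mailbox handshake, or give the port back on failure.
std::optional<uint32_t> try_open_port(uint32_t index);

// Schematic of the naive linear scan: probe ports in order, wrapping
// around modulo the port count, until one can be acquired.
uint32_t open_port(uint32_t start, uint32_t port_count) {
  for (uint32_t index = start;; ++index) {
    if (std::optional<uint32_t> port = try_open_port(index % port_count))
      return *port;
  }
}
```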
@@ -545,7 +551,7 @@ template <uint16_t opcode>
 
 /// Attempts to open a port to use as the server. The server can only open a
 /// port if it has a pending receive operation
-[[clang::convergent]] LIBC_INLINE cpp::optional<typename Server::Port>
+LIBC_INLINE cpp::optional<typename Server::Port>
 Server::try_open(uint32_t lane_size, uint32_t start) {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = start; index < process.port_count; ++index) {
@@ -583,6 +589,13 @@ LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
   }
 }
 
+#if !__has_builtin(__scoped_atomic_load_n)
+#undef __scoped_atomic_load_n
+#undef __scoped_atomic_store_n
+#undef __scoped_atomic_fetch_or
+#undef __scoped_atomic_fetch_and
+#endif
+
 } // namespace rpc
 } // namespace LIBC_NAMESPACE_DECL
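
The `#undef` block mirrors the definitions at the top of the header: since `rpc.h` is meant to be consumed by other projects, the compatibility macros must not leak into, or collide with, translation units that include it.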

libc/src/__support/RPC/rpc_util.h

Lines changed: 6 additions & 13 deletions
@@ -10,20 +10,16 @@
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
 
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/GPU/utils.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
 #include "src/__support/threads/sleep.h"
-#include "src/string/memory_utils/generic/byte_per_byte.h"
-#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
 /// Conditional to indicate if this process is running on the GPU.
 LIBC_INLINE constexpr bool is_process_gpu() {
-#if defined(LIBC_TARGET_ARCH_IS_GPU)
+#if defined(__NVPTX__) || defined(__AMDGPU__)
   return true;
 #else
   return false;
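
Switching from `LIBC_TARGET_ARCH_IS_GPU` to the compiler-provided `__NVPTX__`/`__AMDGPU__` macros removes the `properties/architectures.h` dependency without changing behavior. Because the helper is `constexpr`, callers can branch on it at compile time; a minimal usage sketch (the lane counts below are illustrative, not from the patch):

```c++
constexpr bool is_process_gpu() {
#if defined(__NVPTX__) || defined(__AMDGPU__)
  return true;
#else
  return false;
#endif
}

// The constexpr result lets dead branches compile away entirely.
unsigned lanes_per_thread() {
  if constexpr (is_process_gpu())
    return 32; // e.g. a warp/wavefront-sized value on device builds
  else
    return 1;  // the host process is single-lane
}
```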
@@ -57,14 +53,11 @@ template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
 
 /// Wrapper around the optimal memory copy implementation for the target.
 LIBC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
-  // The built-in memcpy prefers to fully unroll loops. We want to minimize
-  // resource usage so we use a single nounroll loop implementation.
-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-  inline_memcpy_byte_per_byte(reinterpret_cast<Ptr>(dst),
-                              reinterpret_cast<CPtr>(src), count);
-#else
-  inline_memcpy(dst, src, count);
-#endif
+  __builtin_memcpy(dst, src, count);
+}
+
+template <class T> LIBC_INLINE constexpr const T &max(const T &a, const T &b) {
+  return (a < b) ? b : a;
 }
 
 } // namespace rpc
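
These two additions are what let the headers shed `inline_memcpy` and `cpp::max`: `__builtin_memcpy` needs no libc headers at all, and the local `rpc::max` is a drop-in for the `cpp::max` calls in `send_n`/`recv_n`. A self-contained restatement with a quick compile-time check:

```c++
#include <stddef.h>

namespace rpc {
// Freestanding helpers, as in the patch: no libc includes required.
inline void rpc_memcpy(void *dst, const void *src, size_t count) {
  __builtin_memcpy(dst, src, count);
}
template <class T> constexpr const T &max(const T &a, const T &b) {
  return (a < b) ? b : a;
}
} // namespace rpc

static_assert(rpc::max(3u, 7u) == 7u, "rpc::max returns the larger value");
```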
