Skip to content

Commit d175ef5

Browse files
committed
[libc] Replace usage of GPU helpers with ones from 'gpuintrin.h'
Summary: These are provided by a resource header now, cut these from the dependencies and only provide the ones we use for RPC.
1 parent 222f6af commit d175ef5

File tree

2 files changed

+85
-20
lines changed

2 files changed

+85
-20
lines changed

libc/src/__support/RPC/rpc.h

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -110,14 +110,14 @@ template <bool Invert> struct Process {
110110

111111
/// Retrieve the inbox state from memory shared between processes.
112112
LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
113-
return gpu::broadcast_value(
113+
return rpc::broadcast_value(
114114
lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED,
115115
__MEMORY_SCOPE_SYSTEM));
116116
}
117117

118118
/// Retrieve the outbox state from memory shared between processes.
119119
LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
120-
return gpu::broadcast_value(
120+
return rpc::broadcast_value(
121121
lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED,
122122
__MEMORY_SCOPE_SYSTEM));
123123
}
@@ -162,7 +162,7 @@ template <bool Invert> struct Process {
162162

163163
/// Attempt to claim the lock at index. Return true on lock taken.
164164
/// lane_mask is a bitmap of the threads in the warp that would hold the
165-
/// single lock on success, e.g. the result of gpu::get_lane_mask()
165+
/// single lock on success, e.g. the result of rpc::get_lane_mask()
166166
/// The lock is held when the n-th bit of the lock bitfield is set.
167167
LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
168168
// On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
@@ -173,12 +173,12 @@ template <bool Invert> struct Process {
173173
// There may be threads active which are not in lane mask which must not
174174
// succeed in taking the lock, as otherwise it will leak. This is handled
175175
// by making threads which are not in lane_mask or with 0, a no-op.
176-
uint32_t id = gpu::get_lane_id();
176+
uint32_t id = rpc::get_lane_id();
177177
bool id_in_lane_mask = lane_mask & (1ul << id);
178178

179179
// All threads in the warp call fetch_or. Possibly at the same time.
180180
bool before = set_nth(lock, index, id_in_lane_mask);
181-
uint64_t packed = gpu::ballot(lane_mask, before);
181+
uint64_t packed = rpc::ballot(lane_mask, before);
182182

183183
// If every bit set in lane_mask is also set in packed, every single thread
184184
// in the warp failed to get the lock. Ballot returns unset for threads not
@@ -212,8 +212,8 @@ template <bool Invert> struct Process {
212212
// restrict to a single thread to avoid one thread dropping the lock, then
213213
// an unrelated warp claiming the lock, then a second thread in this warp
214214
// dropping the lock again.
215-
clear_nth(lock, index, gpu::is_first_lane(lane_mask));
216-
gpu::sync_lane(lane_mask);
215+
clear_nth(lock, index, rpc::is_first_lane(lane_mask));
216+
rpc::sync_lane(lane_mask);
217217
}
218218

219219
/// Number of bytes to allocate for an inbox or outbox.
@@ -276,9 +276,9 @@ template <typename F>
276276
LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
277277
uint64_t lane_mask, Buffer *slot) {
278278
if constexpr (is_process_gpu()) {
279-
fn(&slot[gpu::get_lane_id()], gpu::get_lane_id());
279+
fn(&slot[rpc::get_lane_id()], rpc::get_lane_id());
280280
} else {
281-
for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
281+
for (uint32_t i = 0; i < lane_size; i += rpc::get_num_lanes())
282282
if (lane_mask & (1ul << i))
283283
fn(&slot[i], i);
284284
}
@@ -323,7 +323,7 @@ template <bool T> struct Port {
323323

324324
LIBC_INLINE void close() {
325325
// Wait for all lanes to finish using the port.
326-
gpu::sync_lane(lane_mask);
326+
rpc::sync_lane(lane_mask);
327327

328328
// The server is passive, if it owns the buffer when it closes we need to
329329
// give ownership back to the client.
@@ -466,7 +466,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
466466
});
467467
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
468468
uint64_t mask = process.header[index].mask;
469-
while (gpu::ballot(mask, idx < num_sends)) {
469+
while (rpc::ballot(mask, idx < num_sends)) {
470470
send([=](Buffer *buffer, uint32_t id) {
471471
uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
472472
? sizeof(Buffer::data)
@@ -499,7 +499,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
499499
});
500500
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
501501
uint64_t mask = process.header[index].mask;
502-
while (gpu::ballot(mask, idx < num_recvs)) {
502+
while (rpc::ballot(mask, idx < num_recvs)) {
503503
recv([=](Buffer *buffer, uint32_t id) {
504504
uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
505505
? sizeof(Buffer::data)
@@ -520,13 +520,13 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
520520
template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
521521
// Repeatedly perform a naive linear scan for a port that can be opened to
522522
// send data.
523-
for (uint32_t index = gpu::get_cluster_id();; ++index) {
523+
for (uint32_t index = 0;; ++index) {
524524
// Start from the beginning if we run out of ports to check.
525525
if (index >= process.port_count)
526526
index = 0;
527527

528528
// Attempt to acquire the lock on this index.
529-
uint64_t lane_mask = gpu::get_lane_mask();
529+
uint64_t lane_mask = rpc::get_lane_mask();
530530
if (!process.try_lock(lane_mask, index))
531531
continue;
532532

@@ -540,12 +540,12 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
540540
continue;
541541
}
542542

543-
if (gpu::is_first_lane(lane_mask)) {
543+
if (rpc::is_first_lane(lane_mask)) {
544544
process.header[index].opcode = opcode;
545545
process.header[index].mask = lane_mask;
546546
}
547-
gpu::sync_lane(lane_mask);
548-
return Port(process, lane_mask, gpu::get_lane_size(), index, out);
547+
rpc::sync_lane(lane_mask);
548+
return Port(process, lane_mask, rpc::get_num_lanes(), index, out);
549549
}
550550
}
551551

@@ -555,7 +555,7 @@ LIBC_INLINE cpp::optional<typename Server::Port>
555555
Server::try_open(uint32_t lane_size, uint32_t start) {
556556
// Perform a naive linear scan for a port that has a pending request.
557557
for (uint32_t index = start; index < process.port_count; ++index) {
558-
uint64_t lane_mask = gpu::get_lane_mask();
558+
uint64_t lane_mask = rpc::get_lane_mask();
559559
uint32_t in = process.load_inbox(lane_mask, index);
560560
uint32_t out = process.load_outbox(lane_mask, index);
561561

libc/src/__support/RPC/rpc_util.h

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,22 +10,87 @@
1010
#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
1111

1212
#include "src/__support/CPP/type_traits.h"
13-
#include "src/__support/macros/attributes.h"
1413
#include "src/__support/macros/config.h"
1514
#include "src/__support/threads/sleep.h"
1615

16+
#if defined(__NVPTX__) || defined(__AMDGPU__)
17+
#include <gpuintrin.h>
18+
#define RPC_TARGET_IS_GPU
19+
#endif
20+
1721
namespace LIBC_NAMESPACE_DECL {
1822
namespace rpc {
1923

2024
/// Conditional to indicate if this process is running on the GPU.
2125
LIBC_INLINE constexpr bool is_process_gpu() {
22-
#if defined(__NVPTX__) || defined(__AMDGPU__)
26+
#ifdef RPC_TARGET_IS_GPU
2327
return true;
2428
#else
2529
return false;
2630
#endif
2731
}
2832

33+
/// Wait for all lanes in the group to complete.
34+
LIBC_INLINE void sync_lane(uint64_t lane_mask) {
35+
#ifdef RPC_TARGET_IS_GPU
36+
return __gpu_sync_lane(lane_mask);
37+
#endif
38+
}
39+
40+
/// Copies the value from the first active thread to the rest.
41+
LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
42+
#ifdef RPC_TARGET_IS_GPU
43+
return __gpu_read_first_lane_u32(lane_mask, x);
44+
#else
45+
return x;
46+
#endif
47+
}
48+
49+
/// Returns the number of lanes that participate in the RPC interface.
50+
LIBC_INLINE uint32_t get_num_lanes() {
51+
#ifdef RPC_TARGET_IS_GPU
52+
return __gpu_num_lanes();
53+
#else
54+
return 1;
55+
#endif
56+
}
57+
58+
/// Returns a bitmask of the threads in the AMD wavefront executing together.
59+
LIBC_INLINE uint64_t get_lane_mask() {
60+
#ifdef RPC_TARGET_IS_GPU
61+
return __gpu_lane_mask();
62+
#else
63+
return 1;
64+
#endif
65+
}
66+
67+
/// Returns the id of the thread inside of an AMD wavefront executing together.
68+
LIBC_INLINE uint32_t get_lane_id() {
69+
#ifdef RPC_TARGET_IS_GPU
70+
return __gpu_lane_id();
71+
#else
72+
return 0;
73+
#endif
74+
}
75+
76+
/// Conditional that is only true for a single thread in a lane.
77+
LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
78+
#ifdef RPC_TARGET_IS_GPU
79+
return __gpu_is_first_in_lane(lane_mask);
80+
#else
81+
return true;
82+
#endif
83+
}
84+
85+
/// Returns a bitmask of the threads in the lane group for which \p x is true.
86+
LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
87+
#ifdef RPC_TARGET_IS_GPU
88+
return __gpu_ballot(lane_mask, x);
89+
#else
90+
return x;
91+
#endif
92+
}
93+
2994
/// Return \p val aligned "upwards" according to \p align.
3095
template <typename V, typename A>
3196
LIBC_INLINE constexpr V align_up(V val, A align) {

0 commit comments

Comments
 (0)