 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
 
 #include "rpc_util.h"
-#include "src/__support/CPP/algorithm.h" // max
-#include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
+/// Use scoped atomic variants if they are available for the target.
+#if !__has_builtin(__scoped_atomic_load_n)
+#define __scoped_atomic_load_n(src, ord, scp) __atomic_load_n(src, ord)
+#define __scoped_atomic_store_n(dst, src, ord, scp)                           \
+  __atomic_store_n(dst, src, ord)
+#define __scoped_atomic_fetch_or(src, val, ord, scp)                          \
+  __atomic_fetch_or(src, val, ord)
+#define __scoped_atomic_fetch_and(src, val, ord, scp)                         \
+  __atomic_fetch_and(src, val, ord)
+#endif
+
 /// A fixed size channel used to communicate between the RPC client and server.
 struct Buffer {
   uint64_t data[8];
@@ -67,18 +76,18 @@ template <bool Invert> struct Process {
   LIBC_INLINE ~Process() = default;
 
   uint32_t port_count = 0;
-  cpp::Atomic<uint32_t> *inbox = nullptr;
-  cpp::Atomic<uint32_t> *outbox = nullptr;
+  uint32_t *inbox = nullptr;
+  uint32_t *outbox = nullptr;
   Header *header = nullptr;
   Buffer *packet = nullptr;
 
   static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
-  cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
+  uint32_t lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
 
   LIBC_INLINE Process(uint32_t port_count, void *buffer)
-      : port_count(port_count), inbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+      : port_count(port_count), inbox(reinterpret_cast<uint32_t *>(
                                     advance(buffer, inbox_offset(port_count)))),
-        outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+        outbox(reinterpret_cast<uint32_t *>(
             advance(buffer, outbox_offset(port_count)))),
         header(reinterpret_cast<Header *>(
             advance(buffer, header_offset(port_count)))),
@@ -102,15 +111,15 @@ template <bool Invert> struct Process {
   /// Retrieve the inbox state from memory shared between processes.
   LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
     return gpu::broadcast_value(
-        lane_mask,
-        inbox[index].load(cpp::MemoryOrder::RELAXED, cpp::MemoryScope::SYSTEM));
+        lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED,
+                                          __MEMORY_SCOPE_SYSTEM));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
   LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
-    return gpu::broadcast_value(lane_mask,
-                                outbox[index].load(cpp::MemoryOrder::RELAXED,
-                                                   cpp::MemoryScope::SYSTEM));
+    return gpu::broadcast_value(
+        lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED,
+                                          __MEMORY_SCOPE_SYSTEM));
   }
 
   /// Signal to the other process that this one is finished with the buffer.
@@ -119,9 +128,9 @@ template <bool Invert> struct Process {
   /// cheaper than calling load_outbox to get the value to store.
   LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
     uint32_t inverted_outbox = !current_outbox;
-    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-    outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED,
-                        cpp::MemoryScope::SYSTEM);
+    __atomic_thread_fence(__ATOMIC_RELEASE);
+    __scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED,
+                            __MEMORY_SCOPE_SYSTEM);
     return inverted_outbox;
   }
 
@@ -133,7 +142,7 @@ template <bool Invert> struct Process {
       sleep_briefly();
       in = load_inbox(lane_mask, index);
     }
-    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    __atomic_thread_fence(__ATOMIC_ACQUIRE);
   }
 
   /// The packet is a linearly allocated array of buffers used to communicate
@@ -155,8 +164,7 @@ template <bool Invert> struct Process {
   /// lane_mask is a bitmap of the threads in the warp that would hold the
   /// single lock on success, e.g. the result of gpu::get_lane_mask()
   /// The lock is held when the n-th bit of the lock bitfield is set.
-  [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
-                                                  uint32_t index) {
+  LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
     // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
     // On volta, need to handle differences between the threads running and
     // the threads that were detected in the previous call to get_lane_mask()
@@ -190,16 +198,15 @@ template <bool Invert> struct Process {
     // inlining the current function.
     bool holding_lock = lane_mask != packed;
     if (holding_lock)
-      atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+      __atomic_thread_fence(__ATOMIC_ACQUIRE);
     return holding_lock;
   }
 
   /// Unlock the lock at index. We need a lane sync to keep this function
   /// convergent, otherwise the compiler will sink the store and deadlock.
-  [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
-                                                uint32_t index) {
+  LIBC_INLINE void unlock(uint64_t lane_mask, uint32_t index) {
     // Do not move any writes past the unlock.
-    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    __atomic_thread_fence(__ATOMIC_RELEASE);
 
     // Use exactly one thread to clear the nth bit in the lock array Must
     // restrict to a single thread to avoid one thread dropping the lock, then
@@ -211,7 +218,7 @@ template <bool Invert> struct Process {
 
   /// Number of bytes to allocate for an inbox or outbox.
   LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
-    return port_count * sizeof(cpp::Atomic<uint32_t>);
+    return port_count * sizeof(uint32_t);
   }
 
   /// Number of bytes to allocate for the buffer containing the packets.
@@ -242,24 +249,24 @@ template <bool Invert> struct Process {
   }
 
   /// Conditionally set the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
-                                                uint32_t index, bool cond) {
+  LIBC_INLINE static constexpr uint32_t set_nth(uint32_t *bits, uint32_t index,
+                                                bool cond) {
     uint32_t slot = index / NUM_BITS_IN_WORD;
     uint32_t bit = index % NUM_BITS_IN_WORD;
-    return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
-                               cpp::MemoryOrder::RELAXED,
-                               cpp::MemoryScope::DEVICE) &
+    return __scoped_atomic_fetch_or(&bits[slot],
+                                    static_cast<uint32_t>(cond) << bit,
+                                    __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE) &
            (1u << bit);
   }
 
   /// Conditionally clear the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
+  LIBC_INLINE static constexpr uint32_t clear_nth(uint32_t *bits,
                                                   uint32_t index, bool cond) {
     uint32_t slot = index / NUM_BITS_IN_WORD;
     uint32_t bit = index % NUM_BITS_IN_WORD;
-    return bits[slot].fetch_and(~0u ^ (static_cast<uint32_t>(cond) << bit),
-                                cpp::MemoryOrder::RELAXED,
-                                cpp::MemoryScope::DEVICE) &
+    return __scoped_atomic_fetch_and(&bits[slot],
+                                     ~0u ^ (static_cast<uint32_t>(cond) << bit),
+                                     __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE) &
            (1u << bit);
   }
 };
@@ -450,7 +457,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
   send([&](Buffer *buffer, uint32_t id) {
     reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
     num_sends = is_process_gpu() ? lane_value(size, id)
-                                 : cpp::max(lane_value(size, id), num_sends);
+                                 : rpc::max(lane_value(size, id), num_sends);
     uint64_t len =
         lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
             ? sizeof(Buffer::data) - sizeof(uint64_t)
@@ -483,7 +490,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
     lane_value(dst, id) =
         reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));
     num_recvs = is_process_gpu() ? lane_value(size, id)
-                                 : cpp::max(lane_value(size, id), num_recvs);
+                                 : rpc::max(lane_value(size, id), num_recvs);
     uint64_t len =
         lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
             ? sizeof(Buffer::data) - sizeof(uint64_t)
@@ -510,8 +517,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 /// port. Each port instance uses an associated \p opcode to tell the server
 /// what to do. The Client interface provides the appropriate lane size to the
 /// port using the platform's returned value.
-template <uint16_t opcode>
-[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
+template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
   for (uint32_t index = gpu::get_cluster_id();; ++index) {
@@ -545,7 +551,7 @@ template <uint16_t opcode>
 
 /// Attempts to open a port to use as the server. The server can only open a
 /// port if it has a pending receive operation
-[[clang::convergent]] LIBC_INLINE cpp::optional<typename Server::Port>
+LIBC_INLINE cpp::optional<typename Server::Port>
 Server::try_open(uint32_t lane_size, uint32_t start) {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = start; index < process.port_count; ++index) {
@@ -583,6 +589,13 @@ LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
   }
 }
 
+#if !__has_builtin(__scoped_atomic_load_n)
+#undef __scoped_atomic_load_n
+#undef __scoped_atomic_store_n
+#undef __scoped_atomic_fetch_or
+#undef __scoped_atomic_fetch_and
+#endif
+
 } // namespace rpc
 } // namespace LIBC_NAMESPACE_DECL
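
A note on the `__scoped_atomic_*` fallback this patch adds at the top of the header: Clang's scoped atomic builtins take an extra memory-scope argument (`__MEMORY_SCOPE_SYSTEM`, `__MEMORY_SCOPE_DEVICE`, ...) so synchronization can be limited to, say, one GPU device instead of the whole system. On compilers without the builtins, the macros drop the scope argument textually and defer to the plain `__atomic_*` builtins, so the `__MEMORY_SCOPE_*` tokens never need to be defined there. A minimal standalone sketch of the same pattern, using `fetch_add` as the example; the `relaxed_increment` helper is hypothetical and not part of the patch:

```cpp
#include <stdint.h>

// Fall back to the unscoped builtins when the scoped variants are not
// available, mirroring the guard this patch adds to rpc.h. The scope
// argument is discarded by the preprocessor, so the __MEMORY_SCOPE_*
// tokens need not be defined on such compilers.
#if !__has_builtin(__scoped_atomic_fetch_add)
#define __scoped_atomic_fetch_add(ptr, val, ord, scp)                         \
  __atomic_fetch_add(ptr, val, ord)
#endif

// Hypothetical helper: a relaxed, device-scoped increment. A scope-aware
// compiler may keep the synchronization local to one device; the fallback
// conservatively uses system-wide (unscoped) semantics.
static inline uint32_t relaxed_increment(uint32_t *counter) {
  return __scoped_atomic_fetch_add(counter, 1, __ATOMIC_RELAXED,
                                   __MEMORY_SCOPE_DEVICE);
}
```

The fallback is safe because an unscoped atomic is at least as strong as any scoped one; the scope only permits the compiler to relax how far the synchronization must propagate.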
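The same migration turns the `lock` bitfield into plain `uint32_t` words updated through `__scoped_atomic_fetch_or` / `__scoped_atomic_fetch_and`. A self-contained sketch of the underlying test-and-set bit-lock idiom, written with the unscoped builtins for brevity; the names `try_acquire` and `release` are illustrative, not from the patch:

```cpp
#include <stdint.h>

// One lock bit per port, packed into 32-bit words as in rpc.h.
static constexpr uint32_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
static uint32_t lock[64 / NUM_BITS_IN_WORD] = {0};

// Attempt to take lock `index`: atomically OR in the n-th bit and inspect
// the previous value. The lock was free iff the bit was previously clear.
static inline bool try_acquire(uint32_t index) {
  uint32_t slot = index / NUM_BITS_IN_WORD;
  uint32_t bit = index % NUM_BITS_IN_WORD;
  uint32_t old = __atomic_fetch_or(&lock[slot], 1u << bit, __ATOMIC_RELAXED);
  return (old & (1u << bit)) == 0;
}

// Drop lock `index` by atomically clearing its bit.
static inline void release(uint32_t index) {
  uint32_t slot = index / NUM_BITS_IN_WORD;
  uint32_t bit = index % NUM_BITS_IN_WORD;
  __atomic_fetch_and(&lock[slot], ~(1u << bit), __ATOMIC_RELAXED);
}
```

In rpc.h proper, `set_nth` additionally takes a `cond` flag so that only selected lanes of a warp perform the read-modify-write, and `try_lock`/`unlock` pair the operations with acquire and release fences; this sketch omits both.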