@@ -110,14 +110,14 @@ template <bool Invert> struct Process {
110
110
111
111
/// Retrieve the inbox state from memory shared between processes.
112
112
LIBC_INLINE uint32_t load_inbox (uint64_t lane_mask, uint32_t index) const {
113
- return gpu ::broadcast_value (
113
+ return rpc ::broadcast_value (
114
114
lane_mask, __scoped_atomic_load_n (&inbox[index], __ATOMIC_RELAXED,
115
115
__MEMORY_SCOPE_SYSTEM));
116
116
}
117
117
118
118
/// Retrieve the outbox state from memory shared between processes.
119
119
LIBC_INLINE uint32_t load_outbox (uint64_t lane_mask, uint32_t index) const {
120
- return gpu ::broadcast_value (
120
+ return rpc ::broadcast_value (
121
121
lane_mask, __scoped_atomic_load_n (&outbox[index], __ATOMIC_RELAXED,
122
122
__MEMORY_SCOPE_SYSTEM));
123
123
}
@@ -162,7 +162,7 @@ template <bool Invert> struct Process {
162
162
163
163
/// Attempt to claim the lock at index. Return true on lock taken.
164
164
/// lane_mask is a bitmap of the threads in the warp that would hold the
165
- /// single lock on success, e.g. the result of gpu::get_lane_mask()
165
+ /// single lock on success, e.g. the result of rpc::get_lane_mask()
166
166
/// The lock is held when the n-th bit of the lock bitfield is set.
167
167
LIBC_INLINE bool try_lock (uint64_t lane_mask, uint32_t index) {
168
168
// On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
@@ -173,12 +173,12 @@ template <bool Invert> struct Process {
173
173
// There may be threads active which are not in lane mask which must not
174
174
// succeed in taking the lock, as otherwise it will leak. This is handled
175
175
// by making threads which are not in lane_mask OR with 0, which is a no-op.
176
- uint32_t id = gpu ::get_lane_id ();
176
+ uint32_t id = rpc ::get_lane_id ();
177
177
bool id_in_lane_mask = lane_mask & (1ul << id);
178
178
179
179
// All threads in the warp call fetch_or. Possibly at the same time.
180
180
bool before = set_nth (lock, index, id_in_lane_mask);
181
- uint64_t packed = gpu ::ballot (lane_mask, before);
181
+ uint64_t packed = rpc ::ballot (lane_mask, before);
182
182
183
183
// If every bit set in lane_mask is also set in packed, every single thread
184
184
// in the warp failed to get the lock. Ballot returns unset for threads not
@@ -212,8 +212,8 @@ template <bool Invert> struct Process {
212
212
// restrict to a single thread to avoid one thread dropping the lock, then
213
213
// an unrelated warp claiming the lock, then a second thread in this warp
214
214
// dropping the lock again.
215
- clear_nth (lock, index, gpu ::is_first_lane (lane_mask));
216
- gpu ::sync_lane (lane_mask);
215
+ clear_nth (lock, index, rpc ::is_first_lane (lane_mask));
216
+ rpc ::sync_lane (lane_mask);
217
217
}
218
218
219
219
/// Number of bytes to allocate for an inbox or outbox.
@@ -276,9 +276,9 @@ template <typename F>
276
276
LIBC_INLINE static void invoke_rpc (F &&fn, uint32_t lane_size,
277
277
uint64_t lane_mask, Buffer *slot) {
278
278
if constexpr (is_process_gpu ()) {
279
- fn (&slot[gpu ::get_lane_id ()], gpu ::get_lane_id ());
279
+ fn (&slot[rpc ::get_lane_id ()], rpc ::get_lane_id ());
280
280
} else {
281
- for (uint32_t i = 0 ; i < lane_size; i += gpu::get_lane_size ())
281
+ for (uint32_t i = 0 ; i < lane_size; i += rpc::get_num_lanes ())
282
282
if (lane_mask & (1ul << i))
283
283
fn (&slot[i], i);
284
284
}
@@ -323,7 +323,7 @@ template <bool T> struct Port {
323
323
324
324
LIBC_INLINE void close () {
325
325
// Wait for all lanes to finish using the port.
326
- gpu ::sync_lane (lane_mask);
326
+ rpc ::sync_lane (lane_mask);
327
327
328
328
// The server is passive, if it owns the buffer when it closes we need to
329
329
// give ownership back to the client.
@@ -466,7 +466,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
466
466
});
467
467
uint64_t idx = sizeof (Buffer::data) - sizeof (uint64_t );
468
468
uint64_t mask = process.header [index].mask ;
469
- while (gpu ::ballot (mask, idx < num_sends)) {
469
+ while (rpc ::ballot (mask, idx < num_sends)) {
470
470
send ([=](Buffer *buffer, uint32_t id) {
471
471
uint64_t len = lane_value (size, id) - idx > sizeof (Buffer::data)
472
472
? sizeof (Buffer::data)
@@ -499,7 +499,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
499
499
});
500
500
uint64_t idx = sizeof (Buffer::data) - sizeof (uint64_t );
501
501
uint64_t mask = process.header [index].mask ;
502
- while (gpu ::ballot (mask, idx < num_recvs)) {
502
+ while (rpc ::ballot (mask, idx < num_recvs)) {
503
503
recv ([=](Buffer *buffer, uint32_t id) {
504
504
uint64_t len = lane_value (size, id) - idx > sizeof (Buffer::data)
505
505
? sizeof (Buffer::data)
@@ -520,13 +520,13 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
520
520
template <uint16_t opcode> LIBC_INLINE Client::Port Client::open () {
521
521
// Repeatedly perform a naive linear scan for a port that can be opened to
522
522
// send data.
523
- for (uint32_t index = gpu::get_cluster_id () ;; ++index) {
523
+ for (uint32_t index = 0 ;; ++index) {
524
524
// Start from the beginning if we run out of ports to check.
525
525
if (index >= process.port_count )
526
526
index = 0 ;
527
527
528
528
// Attempt to acquire the lock on this index.
529
- uint64_t lane_mask = gpu ::get_lane_mask ();
529
+ uint64_t lane_mask = rpc ::get_lane_mask ();
530
530
if (!process.try_lock (lane_mask, index))
531
531
continue ;
532
532
@@ -540,12 +540,12 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
540
540
continue ;
541
541
}
542
542
543
- if (gpu ::is_first_lane (lane_mask)) {
543
+ if (rpc ::is_first_lane (lane_mask)) {
544
544
process.header [index].opcode = opcode;
545
545
process.header [index].mask = lane_mask;
546
546
}
547
- gpu ::sync_lane (lane_mask);
548
- return Port (process, lane_mask, gpu::get_lane_size (), index, out);
547
+ rpc ::sync_lane (lane_mask);
548
+ return Port (process, lane_mask, rpc::get_num_lanes (), index, out);
549
549
}
550
550
}
551
551
@@ -555,7 +555,7 @@ LIBC_INLINE cpp::optional<typename Server::Port>
555
555
Server::try_open (uint32_t lane_size, uint32_t start) {
556
556
// Perform a naive linear scan for a port that has a pending request.
557
557
for (uint32_t index = start; index < process.port_count ; ++index) {
558
- uint64_t lane_mask = gpu ::get_lane_mask ();
558
+ uint64_t lane_mask = rpc ::get_lane_mask ();
559
559
uint32_t in = process.load_inbox (lane_mask, index);
560
560
uint32_t out = process.load_outbox (lane_mask, index);
561
561
0 commit comments