Commit 1143da2

[libc][gpu] Thread divergence fix on volta
The inbox/outbox loads are performed by the current warp, not a single thread. The outbox load indicates whether a port has been successfully opened. If some lanes in the warp think it has and others think the port open failed, because the warp happened to be diverged when the load occurred, all of the subsequent control flow will be incorrect.

The inbox load indicates whether the machine on the other side of the RPC channel has progressed. If lanes in the warp have different ideas about that, some will try to progress their state transition while others won't. As far as the RPC layer is concerned this is a performance problem rather than a correctness one: none of the lanes can start the transition early, they can only miss it and start late. In practice, however, the calls layered on top of RPC do not have the interface required to detect this event and retry the load on the stalled lanes, so those calls will be broken.

None of this is broken on amdgpu, but the readfirstlane is likely to have beneficial performance properties there, possibly significant enough to be worth landing this ahead of fixing gpu::broadcast_value on Volta.

Essentially, Volta wasn't adequately considered when this part of the protocol was written. The bug has been present since the initial prototype and propagated this far because none of the test cases push Volta into a warp-diverged state in the middle of the RPC sequence. We should have test cases for Volta where port_open and equivalent are called from diverged warps.

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D159276
1 parent 37a3de1 commit 1143da2
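To make the hazard concrete, here is a small host-side C++ sketch of the property the patch relies on from gpu::broadcast_value(lane_mask, x). This is an illustration only; simulate_broadcast_value and the per-lane array are hypothetical stand-ins rather than libc code. Every lane named in the mask adopts the value observed by the first active lane, so a diverged warp cannot disagree about what an inbox or outbox load returned.

#include <cstdint>
#include <cstdio>

// Hypothetical host-side model of gpu::broadcast_value(lane_mask, x): every
// active lane receives the value that the first active lane observed.
uint32_t simulate_broadcast_value(uint64_t lane_mask, const uint32_t *per_lane) {
  // __builtin_ffsll returns one plus the index of the least significant set bit.
  int first_active = __builtin_ffsll(lane_mask) - 1;
  return per_lane[first_active];
}

int main() {
  // Suppose a diverged warp raced with the relaxed outbox load: lanes 0 and 1
  // saw the updated flag while lanes 2 and 3 still saw the stale value.
  uint32_t per_lane_load[64] = {1, 1, 0, 0};
  uint64_t lane_mask = 0xF; // lanes 0-3 are active

  // After the broadcast every lane uses the first active lane's observation,
  // so all lanes take the same branch in the RPC state machine.
  for (int lane = 0; lane < 4; ++lane)
    std::printf("lane %d sees %u\n", lane,
                simulate_broadcast_value(lane_mask, per_lane_load));
  return 0;
}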

5 files changed: +48, -48 lines


libc/src/__support/GPU/amdgpu/utils.h

Lines changed: 2 additions & 1 deletion
@@ -125,7 +125,8 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 }
 
 /// Copies the value from the first active thread in the wavefront to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
+                                                           uint32_t x) {
   return __builtin_amdgcn_readfirstlane(x);
 }
 
libc/src/__support/GPU/generic/utils.h

Lines changed: 2 additions & 5 deletions
@@ -61,12 +61,9 @@ LIBC_INLINE uint32_t get_lane_id() { return 0; }
 
 LIBC_INLINE uint64_t get_lane_mask() { return 1; }
 
-LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }
+LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; }
 
-LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  (void)lane_mask;
-  return x;
-}
+LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; }
 
 LIBC_INLINE void sync_threads() {}
 
libc/src/__support/GPU/nvptx/utils.h

Lines changed: 8 additions & 9 deletions
@@ -111,25 +111,24 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 }
 
 /// Copies the value from the first active thread in the warp to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
-  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
-  // lane mask returned here is not always the true lane mask used by the
-  // intrinsics in cases of incedental or enforced divergence by the user.
-  uint32_t lane_mask = static_cast<uint32_t>(get_lane_mask());
-  uint32_t id = __builtin_ffs(lane_mask) - 1;
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
+                                                           uint32_t x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  uint32_t id = __builtin_ffs(mask) - 1;
 #if __CUDA_ARCH__ >= 600
-  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
+  return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
 #else
   return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
 #endif
 }
 
 /// Returns a bitmask of threads in the current lane for which \p x is true.
 [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
 #if __CUDA_ARCH__ >= 600
-  return __nvvm_vote_ballot_sync(static_cast<uint32_t>(lane_mask), x);
+  return __nvvm_vote_ballot_sync(mask, x);
 #else
-  return static_cast<uint32_t>(lane_mask) & __nvvm_vote_ballot(x);
+  return mask & __nvvm_vote_ballot(x);
 #endif
 }
 /// Waits for all the threads in the block to converge and issues a fence.
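The NOTE deleted above describes the root problem: a lane mask derived inside the helper need not match the set of lanes the caller is actually converged with. After this change the caller supplies the mask explicitly, the way the RPC code below does. A hedged usage sketch follows; agree_on_flag is a hypothetical helper, not part of the patch.

// Hypothetical device-side helper written against the patched interface:
// capture the mask where the participating lanes are known, then pass it down
// so the broadcast is performed over exactly those lanes.
LIBC_INLINE uint32_t agree_on_flag(uint32_t per_lane_flag) {
  uint64_t lane_mask = gpu::get_lane_mask();
  return gpu::broadcast_value(lane_mask, per_lane_flag);
}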

libc/src/__support/RPC/rpc.h

Lines changed: 20 additions & 18 deletions
@@ -116,13 +116,15 @@ template <bool Invert, typename Packet> struct Process {
   }
 
   /// Retrieve the inbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_inbox(uint32_t index) {
-    return inbox[index].load(cpp::MemoryOrder::RELAXED);
+  LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) {
+    return gpu::broadcast_value(lane_mask,
+                                inbox[index].load(cpp::MemoryOrder::RELAXED));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_outbox(uint32_t index) {
-    return outbox[index].load(cpp::MemoryOrder::RELAXED);
+  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
+    return gpu::broadcast_value(lane_mask,
+                                outbox[index].load(cpp::MemoryOrder::RELAXED));
   }
 
   /// Signal to the other process that this one is finished with the buffer.
@@ -138,11 +140,11 @@ template <bool Invert, typename Packet> struct Process {
 
   // Given the current outbox and inbox values, wait until the inbox changes
   // to indicate that this thread owns the buffer element.
-  LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox,
-                                      uint32_t in) {
+  LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
+                                      uint32_t outbox, uint32_t in) {
     while (buffer_unavailable(in, outbox)) {
       sleep_briefly();
-      in = load_inbox(index);
+      in = load_inbox(lane_mask, index);
     }
     atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
   }
@@ -393,10 +395,10 @@ template <uint32_t lane_size> struct Server {
 template <bool T, typename S>
 template <typename F>
 LIBC_INLINE void Port<T, S>::send(F fill) {
-  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index);
+  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
 
   // We need to wait until we own the buffer before sending.
-  process.wait_for_ownership(index, out, in);
+  process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p fill function to initialize the buffer and release the memory.
   invoke_rpc(fill, process.packet[index]);
@@ -416,10 +418,10 @@ LIBC_INLINE void Port<T, S>::recv(U use) {
     owns_buffer = false;
   }
 
-  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index);
+  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
 
   // We need to wait until we own the buffer before receiving.
-  process.wait_for_ownership(index, out, in);
+  process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p use function to read the memory out of the buffer.
   invoke_rpc(use, process.packet[index]);
@@ -534,8 +536,8 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
     if (!process.try_lock(lane_mask, index))
      continue;
 
-    uint32_t in = process.load_inbox(index);
-    uint32_t out = process.load_outbox(index);
+    uint32_t in = process.load_inbox(lane_mask, index);
+    uint32_t out = process.load_outbox(lane_mask, index);
 
     // Once we acquire the index we need to check if we are in a valid sending
     // state.
@@ -561,21 +563,21 @@ template <uint32_t lane_size>
 Server<lane_size>::try_open() {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = 0; index < process.port_count; ++index) {
-    uint32_t in = process.load_inbox(index);
-    uint32_t out = process.load_outbox(index);
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t in = process.load_inbox(lane_mask, index);
+    uint32_t out = process.load_outbox(lane_mask, index);
 
     // The server is passive, if there is no work pending don't bother
     // opening a port.
     if (process.buffer_unavailable(in, out))
       continue;
 
     // Attempt to acquire the lock on this index.
-    uint64_t lane_mask = gpu::get_lane_mask();
     if (!process.try_lock(lane_mask, index))
       continue;
 
-    in = process.load_inbox(index);
-    out = process.load_outbox(index);
+    in = process.load_inbox(lane_mask, index);
+    out = process.load_outbox(lane_mask, index);
 
     if (process.buffer_unavailable(in, out)) {
       process.unlock(lane_mask, index);

libc/test/src/__support/RPC/rpc_smoke_test.cpp

Lines changed: 16 additions & 15 deletions
@@ -49,36 +49,37 @@ TEST(LlvmLibcRPCSmoke, SanityCheck) {
   EXPECT_TRUE(ProcB.try_lock(lane_mask, index));
 
   // All zero to begin with
-  EXPECT_EQ(ProcA.load_inbox(index), 0u);
-  EXPECT_EQ(ProcB.load_inbox(index), 0u);
-  EXPECT_EQ(ProcA.load_outbox(index), 0u);
-  EXPECT_EQ(ProcB.load_outbox(index), 0u);
+  EXPECT_EQ(ProcA.load_inbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcA.load_outbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u);
 
   // Available for ProcA and not for ProcB
-  EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(index),
-                                        ProcA.load_outbox(index)));
-  EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(index),
-                                       ProcB.load_outbox(index)));
+  EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index),
+                                        ProcA.load_outbox(lane_mask, index)));
+  EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index),
+                                       ProcB.load_outbox(lane_mask, index)));
 
   // ProcA write to outbox
-  uint32_t ProcAOutbox = ProcA.load_outbox(index);
+  uint32_t ProcAOutbox = ProcA.load_outbox(lane_mask, index);
   EXPECT_EQ(ProcAOutbox, 0u);
   ProcAOutbox = ProcA.invert_outbox(index, ProcAOutbox);
   EXPECT_EQ(ProcAOutbox, 1u);
 
   // No longer available for ProcA
-  EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(index), ProcAOutbox));
+  EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index),
+                                       ProcAOutbox));
 
   // Outbox is still zero, hasn't been written to
-  EXPECT_EQ(ProcB.load_outbox(index), 0u);
+  EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u);
 
   // Wait for ownership will terminate because load_inbox returns 1
-  EXPECT_EQ(ProcB.load_inbox(index), 1u);
-  ProcB.wait_for_ownership(index, 0u, 0u);
+  EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 1u);
+  ProcB.wait_for_ownership(lane_mask, index, 0u, 0u);
 
   // and B now has the buffer available
-  EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(index),
-                                        ProcB.load_outbox(index)));
+  EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index),
+                                        ProcB.load_outbox(lane_mask, index)));
 
   // Enough checks for one test, close the locks
   ProcA.unlock(lane_mask, index);
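For orientation, the handshake this smoke test walks through can be modelled on the host. The sketch below is a hypothetical standalone model, not libc code (available_to_A and available_to_B stand in for the opposite polarities that the Invert parameter on Process selects): with both mailboxes at zero the buffer belongs to ProcA, and inverting ProcA's outbox hands ownership to ProcB.

#include <cassert>
#include <cstdint>

// Hypothetical model of the ownership handshake the smoke test exercises. The
// two sides use opposite conditions for "the buffer is mine".
bool available_to_A(uint32_t in, uint32_t out) { return in == out; }
bool available_to_B(uint32_t in, uint32_t out) { return in != out; }

int main() {
  uint32_t a_out = 0, b_out = 0; // each side's outbox, both start at zero
  // Each side's inbox mirrors the other side's outbox.
  assert(available_to_A(/*in=*/b_out, /*out=*/a_out));  // A owns the buffer
  assert(!available_to_B(/*in=*/a_out, /*out=*/b_out)); // B does not

  a_out ^= 1; // A signals by inverting its outbox, like invert_outbox above
  assert(!available_to_A(b_out, a_out)); // ownership has left A
  assert(available_to_B(a_out, b_out));  // and arrived at B
  return 0;
}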
