Commit 1143da2

[libc][gpu] Thread divergence fix on volta
The inbox/outbox loads are performed by the current warp, not a single thread. The outbox load indicates whether a port has been successfully opened. If some lanes in the warp think it has and others think the port open failed, because the warp happened to be diverged when the load occurred, all of the subsequent control flow will be incorrect.

The inbox load indicates whether the machine on the other side of the RPC channel has progressed. If lanes in the warp have different ideas about that, some will try to progress their state transition while others won't. As far as the RPC layer is concerned this is a performance problem rather than a correctness one: none of the lanes can start the transition early, they can only miss it and start late. In practice, however, the calls layered on top of RPC do not have the interface required to detect this event and retry the load on the stalled lanes, so those calls will be broken.

None of this is broken on amdgpu, but the readfirstlane is likely to have beneficial performance properties there, possibly significant enough to be worth landing this ahead of fixing gpu::broadcast_value on Volta.

Essentially, Volta wasn't adequately considered when this part of the protocol was written. The bug has been present since the initial prototype and propagated this far because none of the test cases push Volta into a warp-diverged state in the middle of the RPC sequence. We should have test cases for Volta where port_open and equivalent are called from diverged warps.

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D159276
1 parent 37a3de1 commit 1143da2
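To make the hazard concrete, here is a small host-side C++ sketch of the property the patch relies on from gpu::broadcast_value(lane_mask, x). This is an illustration only; simulate_broadcast_value and the per-lane array are hypothetical stand-ins rather than libc code. Every lane named in the mask adopts the value observed by the first active lane, so a diverged warp cannot disagree about what an inbox or outbox load returned.

#include <cstdint>
#include <cstdio>

// Hypothetical host-side model of gpu::broadcast_value(lane_mask, x): every
// active lane receives the value that the first active lane observed.
uint32_t simulate_broadcast_value(uint64_t lane_mask, const uint32_t *per_lane) {
  // __builtin_ffsll returns one plus the index of the least significant set bit.
  int first_active = __builtin_ffsll(lane_mask) - 1;
  return per_lane[first_active];
}

int main() {
  // Suppose a diverged warp raced with the relaxed outbox load: lanes 0 and 1
  // saw the updated flag while lanes 2 and 3 still saw the stale value.
  uint32_t per_lane_load[64] = {1, 1, 0, 0};
  uint64_t lane_mask = 0xF; // lanes 0-3 are active

  // After the broadcast every lane uses the first active lane's observation,
  // so all lanes take the same branch in the RPC state machine.
  for (int lane = 0; lane < 4; ++lane)
    std::printf("lane %d sees %u\n", lane,
                simulate_broadcast_value(lane_mask, per_lane_load));
  return 0;
}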

5 files changed: +48, -48 lines


libc/src/__support/GPU/amdgpu/utils.h

Lines changed: 2 additions & 1 deletion
@@ -125,7 +125,8 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 }
 
 /// Copies the value from the first active thread in the wavefront to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
+                                                           uint32_t x) {
   return __builtin_amdgcn_readfirstlane(x);
 }
 
libc/src/__support/GPU/generic/utils.h

Lines changed: 2 additions & 5 deletions
@@ -61,12 +61,9 @@ LIBC_INLINE uint32_t get_lane_id() { return 0; }
 
 LIBC_INLINE uint64_t get_lane_mask() { return 1; }
 
-LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }
+LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; }
 
-LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  (void)lane_mask;
-  return x;
-}
+LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; }
 
 LIBC_INLINE void sync_threads() {}
 
libc/src/__support/GPU/nvptx/utils.h

Lines changed: 8 additions & 9 deletions
@@ -111,25 +111,24 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 }
 
 /// Copies the value from the first active thread in the warp to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
-  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
-  // lane mask returned here is not always the true lane mask used by the
-  // intrinsics in cases of incedental or enforced divergence by the user.
-  uint32_t lane_mask = static_cast<uint32_t>(get_lane_mask());
-  uint32_t id = __builtin_ffs(lane_mask) - 1;
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
+                                                           uint32_t x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  uint32_t id = __builtin_ffs(mask) - 1;
 #if __CUDA_ARCH__ >= 600
-  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
+  return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
 #else
   return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
 #endif
 }
 
 /// Returns a bitmask of threads in the current lane for which \p x is true.
 [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
 #if __CUDA_ARCH__ >= 600
-  return __nvvm_vote_ballot_sync(static_cast<uint32_t>(lane_mask), x);
+  return __nvvm_vote_ballot_sync(mask, x);
 #else
-  return static_cast<uint32_t>(lane_mask) & __nvvm_vote_ballot(x);
+  return mask & __nvvm_vote_ballot(x);
 #endif
 }
 /// Waits for all the threads in the block to converge and issues a fence.
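The NOTE deleted above describes the root problem: a lane mask derived inside the helper need not match the set of lanes the caller is actually converged with. After this change the caller supplies the mask explicitly, the way the RPC code below does. A hedged usage sketch follows; agree_on_flag is a hypothetical helper, not part of the patch.

// Hypothetical device-side helper written against the patched interface:
// capture the mask where the participating lanes are known, then pass it down
// so the broadcast is performed over exactly those lanes.
LIBC_INLINE uint32_t agree_on_flag(uint32_t per_lane_flag) {
  uint64_t lane_mask = gpu::get_lane_mask();
  return gpu::broadcast_value(lane_mask, per_lane_flag);
}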

libc/src/__support/RPC/rpc.h

Lines changed: 20 additions & 18 deletions
@@ -116,13 +116,15 @@ template <bool Invert, typename Packet> struct Process {
   }
 
   /// Retrieve the inbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_inbox(uint32_t index) {
-    return inbox[index].load(cpp::MemoryOrder::RELAXED);
+  LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) {
+    return gpu::broadcast_value(lane_mask,
+                                inbox[index].load(cpp::MemoryOrder::RELAXED));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_outbox(uint32_t index) {
-    return outbox[index].load(cpp::MemoryOrder::RELAXED);
+  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
+    return gpu::broadcast_value(lane_mask,
+                                outbox[index].load(cpp::MemoryOrder::RELAXED));
   }
 
   /// Signal to the other process that this one is finished with the buffer.
@@ -138,11 +140,11 @@ template <bool Invert, typename Packet> struct Process {
 
   // Given the current outbox and inbox values, wait until the inbox changes
   // to indicate that this thread owns the buffer element.
-  LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox,
-                                      uint32_t in) {
+  LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
+                                      uint32_t outbox, uint32_t in) {
     while (buffer_unavailable(in, outbox)) {
       sleep_briefly();
-      in = load_inbox(index);
+      in = load_inbox(lane_mask, index);
     }
     atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
   }
@@ -393,10 +395,10 @@ template <uint32_t lane_size> struct Server {
 template <bool T, typename S>
 template <typename F>
 LIBC_INLINE void Port<T, S>::send(F fill) {
-  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index);
+  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
 
   // We need to wait until we own the buffer before sending.
-  process.wait_for_ownership(index, out, in);
+  process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p fill function to initialize the buffer and release the memory.
   invoke_rpc(fill, process.packet[index]);
@@ -416,10 +418,10 @@ LIBC_INLINE void Port<T, S>::recv(U use) {
     owns_buffer = false;
   }
 
-  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index);
+  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
 
   // We need to wait until we own the buffer before receiving.
-  process.wait_for_ownership(index, out, in);
+  process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p use function to read the memory out of the buffer.
   invoke_rpc(use, process.packet[index]);
@@ -534,8 +536,8 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
     if (!process.try_lock(lane_mask, index))
      continue;
 
-    uint32_t in = process.load_inbox(index);
-    uint32_t out = process.load_outbox(index);
+    uint32_t in = process.load_inbox(lane_mask, index);
+    uint32_t out = process.load_outbox(lane_mask, index);
 
     // Once we acquire the index we need to check if we are in a valid sending
     // state.
@@ -561,21 +563,21 @@ template <uint32_t lane_size>
 Server<lane_size>::try_open() {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = 0; index < process.port_count; ++index) {
-    uint32_t in = process.load_inbox(index);
-    uint32_t out = process.load_outbox(index);
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t in = process.load_inbox(lane_mask, index);
+    uint32_t out = process.load_outbox(lane_mask, index);
 
     // The server is passive, if there is no work pending don't bother
     // opening a port.
     if (process.buffer_unavailable(in, out))
       continue;
 
     // Attempt to acquire the lock on this index.
-    uint64_t lane_mask = gpu::get_lane_mask();
     if (!process.try_lock(lane_mask, index))
       continue;
 
-    in = process.load_inbox(index);
-    out = process.load_outbox(index);
+    in = process.load_inbox(lane_mask, index);
+    out = process.load_outbox(lane_mask, index);
 
     if (process.buffer_unavailable(in, out)) {
       process.unlock(lane_mask, index);

libc/test/src/__support/RPC/rpc_smoke_test.cpp

Lines changed: 16 additions & 15 deletions
@@ -49,36 +49,37 @@ TEST(LlvmLibcRPCSmoke, SanityCheck) {
   EXPECT_TRUE(ProcB.try_lock(lane_mask, index));
 
   // All zero to begin with
-  EXPECT_EQ(ProcA.load_inbox(index), 0u);
-  EXPECT_EQ(ProcB.load_inbox(index), 0u);
-  EXPECT_EQ(ProcA.load_outbox(index), 0u);
-  EXPECT_EQ(ProcB.load_outbox(index), 0u);
+  EXPECT_EQ(ProcA.load_inbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcA.load_outbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u);
 
   // Available for ProcA and not for ProcB
-  EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(index),
-                                        ProcA.load_outbox(index)));
-  EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(index),
-                                       ProcB.load_outbox(index)));
+  EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index),
+                                        ProcA.load_outbox(lane_mask, index)));
+  EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index),
+                                       ProcB.load_outbox(lane_mask, index)));
 
   // ProcA write to outbox
-  uint32_t ProcAOutbox = ProcA.load_outbox(index);
+  uint32_t ProcAOutbox = ProcA.load_outbox(lane_mask, index);
   EXPECT_EQ(ProcAOutbox, 0u);
   ProcAOutbox = ProcA.invert_outbox(index, ProcAOutbox);
   EXPECT_EQ(ProcAOutbox, 1u);
 
   // No longer available for ProcA
-  EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(index), ProcAOutbox));
+  EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index),
+                                       ProcAOutbox));
 
   // Outbox is still zero, hasn't been written to
-  EXPECT_EQ(ProcB.load_outbox(index), 0u);
+  EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u);
 
   // Wait for ownership will terminate because load_inbox returns 1
-  EXPECT_EQ(ProcB.load_inbox(index), 1u);
-  ProcB.wait_for_ownership(index, 0u, 0u);
+  EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 1u);
+  ProcB.wait_for_ownership(lane_mask, index, 0u, 0u);
 
   // and B now has the buffer available
-  EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(index),
-                                        ProcB.load_outbox(index)));
+  EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index),
+                                        ProcB.load_outbox(lane_mask, index)));
 
   // Enough checks for one test, close the locks
   ProcA.unlock(lane_mask, index);
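For orientation, the handshake this smoke test walks through can be modelled on the host. The sketch below is a hypothetical standalone model, not libc code (available_to_A and available_to_B stand in for the opposite polarities that the Invert parameter on Process selects): with both mailboxes at zero the buffer belongs to ProcA, and inverting ProcA's outbox hands ownership to ProcB.

#include <cassert>
#include <cstdint>

// Hypothetical model of the ownership handshake the smoke test exercises. The
// two sides use opposite conditions for "the buffer is mine".
bool available_to_A(uint32_t in, uint32_t out) { return in == out; }
bool available_to_B(uint32_t in, uint32_t out) { return in != out; }

int main() {
  uint32_t a_out = 0, b_out = 0; // each side's outbox, both start at zero
  // Each side's inbox mirrors the other side's outbox.
  assert(available_to_A(/*in=*/b_out, /*out=*/a_out));  // A owns the buffer
  assert(!available_to_B(/*in=*/a_out, /*out=*/b_out)); // B does not

  a_out ^= 1; // A signals by inverting its outbox, like invert_outbox above
  assert(!available_to_A(b_out, a_out)); // ownership has left A
  assert(available_to_B(a_out, b_out));  // and arrived at B
  return 0;
}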
