llvm · jhuber6 · Mar 12, 2024 · Mar 12, 2024 · arsenm · Mar 12, 2024
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
@@ -145,6 +145,12 @@ LIBC_INLINE uint32_t get_lane_size() {
   __builtin_amdgcn_wave_barrier();
 }
 
+/// Gets x from lane idx of the wavefront; the lane mask argument is unused.
+[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
+                                                   uint32_t x) {
+  return __builtin_amdgcn_ds_bpermute(idx << 2, x); // idx as a byte offset.
+}
+
 /// Returns the current value of the GPU's processor clock.
 /// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
 LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }

diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
@@ -67,6 +67,8 @@ LIBC_INLINE void sync_threads() {}
 
 LIBC_INLINE void sync_lane(uint64_t) {}
 
+LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; }
+
 LIBC_INLINE uint64_t processor_clock() { return 0; }
 
 LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }

diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
@@ -126,6 +126,14 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; }
   __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
 }
 
+/// Gets x from lane idx, or zero if idx is out of range or not in the mask.
+[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
+                                                   uint32_t idx, uint32_t x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  uint32_t bitmask = idx < get_lane_size() ? (mask >> idx) & 1 : 0;
+  return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
+}
+
 /// Returns the current value of the GPU's processor clock.
 LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
 

diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
@@ -31,6 +31,25 @@ LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
   return gpu::get_lane_id() == get_first_lane_id(lane_mask);
 }
 
+/// Gets the sum of x across the lanes inside the warp or wavefront.
+LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
+  for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
+    uint32_t index = step + gpu::get_lane_id(); // May exceed the lane size.
+    x += gpu::shuffle(lane_mask, index, x);
+  }
+  return gpu::broadcast_value(lane_mask, x);
+}
+
+/// Gets the inclusive prefix sum of x over the lanes of the warp or wavefront.
+LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
+  for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
+    uint32_t index = gpu::get_lane_id() - step; // Wraps when lane_id < step.
+    uint32_t bitmask = gpu::get_lane_id() >= step; // Zero for wrapped lanes.
+    x += -bitmask & gpu::shuffle(lane_mask, index, x);
+  }
+  return x;
+}
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE
 

diff --git a/libc/test/integration/src/__support/CMakeLists.txt b/libc/test/integration/src/__support/CMakeLists.txt
@@ -1 +1,4 @@
 add_subdirectory(threads)
+if(LIBC_TARGET_OS_IS_GPU)
+  add_subdirectory(GPU)
+endif()
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-gpu-tests)
+add_dependencies(libc-integration-tests libc-support-gpu-tests)
+
+add_integration_test(
+  scan_reduce_test
+  SUITE libc-support-gpu-tests
+  SRCS
+    scan_reduce.cpp
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -0,0 +1,62 @@
+//===-- Test for the parallel scan and reduction operations on the GPU ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+static uint32_t sum(uint32_t n) { return n * (n + 1) / 2; } // 1 + 2 + ... + n
+
+// Tests a reduction within a convergent warp or wavefront using some known
+// values. For example, if every lane contributes one, then the sum should be
+// the size of the warp or wavefront, i.e. 1 + 1 + 1 ... + 1.
+static void test_reduce() {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t x = gpu::reduce(mask, 1);
+  EXPECT_EQ(x, gpu::get_lane_size());
+
+  uint32_t y = gpu::reduce(mask, gpu::get_lane_id());
+  EXPECT_EQ(y, sum(gpu::get_lane_size() - 1));
+
+  uint32_t z = 0;
+  if (gpu::get_lane_id() % 2) // Only odd lanes take part in this reduction.
+    z = gpu::reduce(gpu::get_lane_mask(), 1);
+  gpu::sync_lane(mask);
+
+  EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_size() / 2 : 0);
+}
+
+// Tests an accumulation scan within a convergent warp or wavefront using some
+// known values. For example, if every lane contributes one, then each lane's
+// result should be its ID plus one, i.e. 1, 1 + 1, ...
+static void test_scan() {
+  uint64_t mask = gpu::get_lane_mask();
+
+  uint32_t x = gpu::scan(mask, 1);
+  EXPECT_EQ(x, gpu::get_lane_id() + 1);
+
+  uint32_t y = gpu::scan(mask, gpu::get_lane_id());
+  EXPECT_EQ(y, sum(gpu::get_lane_id()));
+
+  uint32_t z = 0;
+  if (gpu::get_lane_id() % 2) // Only odd lanes take part in this scan.
+    z = gpu::scan(gpu::get_lane_mask(), 1);
+  gpu::sync_lane(mask);
+
+  EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  test_reduce();
+
+  test_scan();
+
+  return 0;
+}