Skip to content

Commit 261e564

Browse files
authored
[libc] Add utility functions for warp-level scan and reduction (llvm#84866)
Summary: The GPU uses a SIMT execution model. That means that each value actually belongs to a group of 32 or 64 lanes executing next to each other. These platforms offer some intrinsic functions to take elements from neighboring lanes. With these we can do parallel scans or reductions. These functions do not have an immediate user, but they will be used in the allocator interface that is in progress and are generally good to have. This patch is a precommit for these new utility functions.
1 parent c167a25 commit 261e564

File tree

7 files changed

+111
-0
lines changed

7 files changed

+111
-0
lines changed

libc/src/__support/GPU/amdgpu/utils.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,12 @@ LIBC_INLINE uint32_t get_lane_size() {
145145
__builtin_amdgcn_wave_barrier();
146146
}
147147

148+
/// Shuffles the lanes inside the wavefront according to the given index.
/// The first (unnamed) parameter is accepted but not used by this
/// implementation. `idx` is the lane index to shuffle from and `x` is the value
/// supplied by the calling lane; the result is whatever
/// `__builtin_amdgcn_ds_bpermute` returns for those operands.
149+
[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
150+
uint32_t x) {
// The index is scaled by four (`idx << 2`) before being handed to the builtin;
// presumably the builtin addresses lanes in bytes -- TODO confirm against the
// AMDGPU builtin documentation.
151+
return __builtin_amdgcn_ds_bpermute(idx << 2, x);
152+
}
153+
148154
/// Returns the current value of the GPU's processor clock.
149155
/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
/// Implemented as a direct call to `__builtin_readcyclecounter()`.
150156
LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }

libc/src/__support/GPU/generic/utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ LIBC_INLINE void sync_threads() {}
6767

6868
/// Generic fallback for lane synchronization: an empty function that ignores
/// its argument.
LIBC_INLINE void sync_lane(uint64_t) {}
6969

70+
/// Generic fallback for the lane shuffle: the mask and index arguments are
/// ignored and the caller's own value `x` is returned unchanged.
LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; }
71+
7072
/// Generic fallback for the processor clock: always returns 0.
LIBC_INLINE uint64_t processor_clock() { return 0; }
7173

7274
/// Generic fallback for the fixed-frequency clock: always returns 0.
LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }

libc/src/__support/GPU/nvptx/utils.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,14 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; }
126126
__nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
127127
}
128128

129+
/// Shuffles the lanes inside the warp according to the given index.
/// Only the low 32 bits of `lane_mask` are used. If the bit for `idx` is not
/// set in that mask the result is zero; otherwise it is the value produced by
/// `__nvvm_shfl_sync_idx_i32` for `x` and `idx`.
130+
[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
131+
uint32_t idx, uint32_t x) {
132+
uint32_t mask = static_cast<uint32_t>(lane_mask);
// 1 when the bit for lane `idx` is set in the mask, 0 otherwise.
// NOTE(review): callers can pass `idx` >= 32 (e.g. `step + lane id` in the
// reduction helper); shifting a 32-bit value by that amount deserves a second
// look -- TODO confirm this is intended.
133+
uint32_t bitmask = (mask >> idx) & 1;
// `-bitmask` is either all ones or all zeros, so the AND keeps the shuffled
// value or clears it to zero.
134+
return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
135+
}
136+
129137
/// Returns the current value of the GPU's processor clock.
/// Implemented as a direct call to `__builtin_readcyclecounter()`.
130138
LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
131139

libc/src/__support/GPU/utils.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,25 @@ LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
3131
return gpu::get_lane_id() == get_first_lane_id(lane_mask);
3232
}
3333

34+
/// Gets the sum of all lanes inside the warp or wavefront.
/// `lane_mask` is forwarded to `gpu::shuffle` and `gpu::broadcast_value`; `x` is
/// the value contributed by the calling lane. The loop starts with a step of
/// half the lane size and halves it each iteration, adding in the value
/// shuffled from the lane `step` positions above the current one. The returned
/// value is the accumulated `x` passed through `gpu::broadcast_value`.
35+
LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
36+
for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
// For the upper lanes this index can be larger than the lane size; how such
// an index is treated is up to the target's gpu::shuffle -- TODO confirm.
37+
uint32_t index = step + gpu::get_lane_id();
38+
x += gpu::shuffle(lane_mask, index, x);
39+
}
40+
return gpu::broadcast_value(lane_mask, x);
41+
}
42+
43+
/// Gets the accumulator scan of the threads in the warp or wavefront.
/// `lane_mask` is forwarded to `gpu::shuffle`; `x` is the value contributed by
/// the calling lane. The step starts at one and doubles until it reaches the
/// lane size; on each iteration a lane whose ID is at least `step` adds the
/// value shuffled from the lane `step` positions below it. Each lane returns
/// its own accumulated `x`.
44+
LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
45+
for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
// Unsigned subtraction: for lanes with an ID below `step` this wraps around,
// but the shuffled value is masked off by `bitmask` below.
46+
uint32_t index = gpu::get_lane_id() - step;
// 1 if this lane has a source lane `step` positions below it, otherwise 0.
47+
uint32_t bitmask = gpu::get_lane_id() >= step;
// `-bitmask` is all ones or all zeros, so lanes without a source add zero.
48+
x += -bitmask & gpu::shuffle(lane_mask, index, x);
49+
}
50+
return x;
51+
}
52+
3453
} // namespace gpu
3554
} // namespace LIBC_NAMESPACE
3655

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
11
# The threads tests are always added.
add_subdirectory(threads)
2+
# Only descend into the GPU directory when LIBC_TARGET_OS_IS_GPU is set.
if(LIBC_TARGET_OS_IS_GPU)
3+
add_subdirectory(GPU)
4+
endif()
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Umbrella target for the GPU support tests.
add_custom_target(libc-support-gpu-tests)
2+
# Make libc-integration-tests depend on the GPU support test target.
add_dependencies(libc-integration-tests libc-support-gpu-tests)
3+
4+
# Integration test built from scan_reduce.cpp and registered in the
# libc-support-gpu-tests suite. The loader is passed `--threads 64`,
# presumably so that a full 64-lane wavefront is populated -- TODO confirm.
add_integration_test(
5+
scan_reduce_test
6+
SUITE libc-support-gpu-tests
7+
SRCS
8+
scan_reduce.cpp
9+
LOADER_ARGS
10+
--threads 64
11+
)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
//===-- Test for the parallel scan and reduction operations on the GPU ----===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/CPP/bit.h"
10+
#include "src/__support/GPU/utils.h"
11+
#include "test/IntegrationTest/test.h"
12+
13+
using namespace LIBC_NAMESPACE;
14+
15+
// Closed-form value of 0 + 1 + ... + n, computed as n * (n + 1) / 2 in 32-bit
// unsigned arithmetic.
static uint32_t sum(uint32_t n) { return n * (n + 1) / 2; }
16+
17+
// Tests a reduction within a convergent warp or wavefront using some known
18+
// values. For example, if every element in the lane is one, then the sum should
19+
// be the size of the warp or wavefront, i.e. 1 + 1 + 1 ... + 1.
20+
static void test_reduce() {
21+
uint64_t mask = gpu::get_lane_mask();
// Every lane contributes 1, so the expected sum is the lane size.
22+
uint32_t x = gpu::reduce(mask, 1);
23+
EXPECT_EQ(x, gpu::get_lane_size());
24+
// Every lane contributes its own ID, so the expected sum is
// 0 + 1 + ... + (lane size - 1).
25+
uint32_t y = gpu::reduce(mask, gpu::get_lane_id());
26+
EXPECT_EQ(y, sum(gpu::get_lane_size() - 1));
27+
// Only odd-numbered lanes take part, using a lane mask queried inside the
// branch; the other lanes keep z == 0.
28+
uint32_t z = 0;
29+
if (gpu::get_lane_id() % 2)
30+
z = gpu::reduce(gpu::get_lane_mask(), 1);
31+
gpu::sync_lane(mask);
32+
// Odd lanes expect half the lane size; even lanes expect the untouched zero.
33+
EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_size() / 2 : 0);
34+
}
35+
36+
// Tests an accumulation scan within a convergent warp or wavefront using some
37+
// known values. For example, if every element in the lane is one, then the scan
38+
// should have each element be equivalent to its ID plus one, i.e. 1, 1 + 1, ...
39+
static void test_scan() {
40+
uint64_t mask = gpu::get_lane_mask();
41+
// Every lane contributes 1, so each lane expects its ID plus one.
42+
uint32_t x = gpu::scan(mask, 1);
43+
EXPECT_EQ(x, gpu::get_lane_id() + 1);
44+
// Every lane contributes its own ID, so each lane expects 0 + 1 + ... + ID.
45+
uint32_t y = gpu::scan(mask, gpu::get_lane_id());
46+
EXPECT_EQ(y, sum(gpu::get_lane_id()));
47+
// Only odd-numbered lanes take part, using a lane mask queried inside the
// branch; the other lanes keep z == 0.
48+
uint32_t z = 0;
49+
if (gpu::get_lane_id() % 2)
50+
z = gpu::scan(gpu::get_lane_mask(), 1);
51+
gpu::sync_lane(mask);
52+
// Odd lanes expect ID / 2 + 1; even lanes expect the untouched zero.
53+
EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
54+
}
55+
56+
// Test entry point: runs the reduction checks, then the scan checks, and
// returns 0. The argc/argv/envp parameters are not used in the body.
TEST_MAIN(int argc, char **argv, char **envp) {
57+
test_reduce();
58+
59+
test_scan();
60+
61+
return 0;
62+
}

0 commit comments

Comments
 (0)