Skip to content

Commit 1ee32d2

Browse files
jhuber6 authored and tstellar committed
[Clang] Add width handling for <gpuintrin.h> shuffle helper (llvm#125896)
Summary: The CUDA implementation has long supported the `width` argument on its shuffle instructions, which makes it more difficult to replace those uses with this helper. This patch just correctly implements that for AMDGPU and NVPTX so it's equivalent to `__shfl_sync` in CUDA. This will ease porting. Fortunately these get optimized out correctly when passing in known widths. (cherry picked from commit 2d8106c)
1 parent 6195c3a commit 1ee32d2

File tree

6 files changed

+76
-24
lines changed

6 files changed

+76
-24
lines changed

clang/lib/Headers/amdgpuintrin.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
145145

146146
// Shuffles the lanes inside the wavefront according to the given index.
147147
_DEFAULT_FN_ATTRS static __inline__ uint32_t
148-
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
149-
return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
148+
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
149+
uint32_t __width) {
150+
uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
151+
return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
150152
}
151153

152154
// Shuffles the lanes inside the wavefront according to the given index.
153155
_DEFAULT_FN_ATTRS static __inline__ uint64_t
154-
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
156+
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
157+
uint32_t __width) {
155158
uint32_t __hi = (uint32_t)(__x >> 32ull);
156159
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
157-
return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
158-
((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
160+
return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
161+
<< 32ull) |
162+
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
159163
}
160164

161165
// Returns true if the flat pointer points to AMDGPU 'shared' memory.

clang/lib/Headers/gpuintrin.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
133133

134134
// Shuffles the lanes according to the given index.
135135
_DEFAULT_FN_ATTRS static __inline__ float
136-
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
136+
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
137+
uint32_t __width) {
137138
return __builtin_bit_cast(
138139
float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
139-
__builtin_bit_cast(uint32_t, __x)));
140+
__builtin_bit_cast(uint32_t, __x), __width));
140141
}
141142

142143
// Shuffles the lanes according to the given index.
143144
_DEFAULT_FN_ATTRS static __inline__ double
144-
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
145+
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
146+
uint32_t __width) {
145147
return __builtin_bit_cast(
146-
double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
147-
__builtin_bit_cast(uint64_t, __x)));
148+
double,
149+
__gpu_shuffle_idx_u64(__lane_mask, __idx,
150+
__builtin_bit_cast(uint64_t, __x), __width));
148151
}
149152

150153
// Gets the sum of all lanes inside the warp or wavefront.
@@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
153156
uint64_t __lane_mask, __type __x) { \
154157
for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
155158
uint32_t __index = __step + __gpu_lane_id(); \
156-
__x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
159+
__x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
160+
__gpu_num_lanes()); \
157161
} \
158162
return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
159163
}
@@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
171175
uint32_t __index = __gpu_lane_id() - __step; \
172176
__bitmask_type bitmask = __gpu_lane_id() >= __step; \
173177
__x += __builtin_bit_cast( \
174-
__type, \
175-
-bitmask & __builtin_bit_cast(__bitmask_type, \
176-
__gpu_shuffle_idx_##__suffix( \
177-
__lane_mask, __index, __x))); \
178+
__type, -bitmask & __builtin_bit_cast(__bitmask_type, \
179+
__gpu_shuffle_idx_##__suffix( \
180+
__lane_mask, __index, __x, \
181+
__gpu_num_lanes()))); \
178182
} \
179183
return __x; \
180184
}

clang/lib/Headers/nvptxintrin.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
149149

150150
// Shuffles the lanes inside the warp according to the given index.
151151
_DEFAULT_FN_ATTRS static __inline__ uint32_t
152-
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
152+
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
153+
uint32_t __width) {
153154
uint32_t __mask = (uint32_t)__lane_mask;
154-
return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
155+
return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
156+
((__gpu_num_lanes() - __width) << 8u) | 0x1f);
155157
}
156158

157159
// Shuffles the lanes inside the warp according to the given index.
158160
_DEFAULT_FN_ATTRS static __inline__ uint64_t
159-
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
161+
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
162+
uint32_t __width) {
160163
uint32_t __hi = (uint32_t)(__x >> 32ull);
161164
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
162165
uint32_t __mask = (uint32_t)__lane_mask;
163-
return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
164-
__gpu_num_lanes() - 1u)
166+
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
165167
<< 32ull) |
166-
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
167-
__gpu_num_lanes() - 1u));
168+
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
168169
}
169170

170171
// Returns true if the flat pointer points to CUDA 'shared' memory.

libc/src/__support/GPU/utils.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
8787

8888
LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
8989

90-
LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
91-
return __gpu_shuffle_idx_u32(lane_mask, idx, x);
90+
LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
91+
uint32_t width = __gpu_num_lanes()) {
92+
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
9293
}
9394

9495
[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }

libc/test/integration/src/__support/GPU/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,12 @@ add_integration_test(
99
LOADER_ARGS
1010
--threads 64
1111
)
12+
13+
add_integration_test(
14+
shuffle_test
15+
SUITE libc-support-gpu-tests
16+
SRCS
17+
shuffle.cpp
18+
LOADER_ARGS
19+
--threads 64
20+
)
libc/test/integration/src/__support/GPU/shuffle.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
//===-- Test for the shuffle operations on the GPU ------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/CPP/bit.h"
10+
#include "src/__support/GPU/utils.h"
11+
#include "test/IntegrationTest/test.h"
12+
13+
using namespace LIBC_NAMESPACE;
14+
15+
// Test to make sure the shuffle instruction works by doing a simple broadcast.
16+
// Each iteration reduces the width, so it will broadcast to a subset we check.
17+
static void test_shuffle() {
18+
uint64_t mask = gpu::get_lane_mask();
19+
EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size());
20+
21+
uint32_t x = gpu::get_lane_id();
22+
for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2)
23+
EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width);
24+
}
25+
26+
TEST_MAIN(int argc, char **argv, char **envp) {
27+
if (gpu::get_thread_id() >= gpu::get_lane_size())
28+
return 0;
29+
30+
test_shuffle();
31+
32+
return 0;
33+
}

0 commit comments

Comments
 (0)