Skip to content

[Clang] Add width handling for <gpuintrin.h> shuffle helper #125896

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions clang/lib/Headers/amdgpuintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {

// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
uint32_t __width) {
uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
}

// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
<< 32ull) |
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}

// Returns true if the flat pointer points to AMDGPU 'shared' memory.
Expand Down
24 changes: 14 additions & 10 deletions clang/lib/Headers/gpuintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {

// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
uint32_t __width) {
return __builtin_bit_cast(
float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
__builtin_bit_cast(uint32_t, __x)));
__builtin_bit_cast(uint32_t, __x), __width));
}

// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
uint32_t __width) {
return __builtin_bit_cast(
double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
__builtin_bit_cast(uint64_t, __x)));
double,
__gpu_shuffle_idx_u64(__lane_mask, __idx,
__builtin_bit_cast(uint64_t, __x), __width));
}

// Gets the sum of all lanes inside the warp or wavefront.
Expand All @@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
uint64_t __lane_mask, __type __x) { \
for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
uint32_t __index = __step + __gpu_lane_id(); \
__x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
__x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
__gpu_num_lanes()); \
} \
return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
}
Expand All @@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
uint32_t __index = __gpu_lane_id() - __step; \
__bitmask_type bitmask = __gpu_lane_id() >= __step; \
__x += __builtin_bit_cast( \
__type, \
-bitmask & __builtin_bit_cast(__bitmask_type, \
__gpu_shuffle_idx_##__suffix( \
__lane_mask, __index, __x))); \
__type, -bitmask & __builtin_bit_cast(__bitmask_type, \
__gpu_shuffle_idx_##__suffix( \
__lane_mask, __index, __x, \
__gpu_num_lanes()))); \
} \
return __x; \
}
Expand Down
15 changes: 8 additions & 7 deletions clang/lib/Headers/nvptxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {

// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
uint32_t __width) {
uint32_t __mask = (uint32_t)__lane_mask;
return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
((__gpu_num_lanes() - __width) << 8u) | 0x1f);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUIC, the 0x1f replaces the original __gpu_num_lanes() - 1u and assumes that the warp width will never change.
The bits 8-12 contain "mask for logically splitting warps".
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync

How exactly does (__gpu_num_lanes() - __width) create a mask? AFAICT it will only work when __width == __gpu_num_lanes() and the value is 0, or if width == 1 which does make a mask for the __gpu_num_lanes(), but I don't think that's the intent.
Was it supposed to be ((__gpu_num_lanes() - __width) - 1) << 8u ? But that would not be right either, as then with the default __width == __gpu_num_lanes() we'd end up with (0-1) << 8 and have all upper bits set.

Either I'm confused, or the code as written has a bug.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took it from https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/__clang_cuda_intrinsics.h#L88 and it gave me the output I expected, so I assumed it was right.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm.. Looks like CUDA SDK implements shfl_sync_idx in their own headers the same way:
image

OK, I'm officially confused now, but given that it's been implemented this way for about a decade now, I'm fine keeping it as is, until there's concrete evidence that it's broken.

In practice it probably means that we can't (and don't) really use non-default values for width on NVIDIA GPUs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know if this is okay so I can approve the backport? (Also below is my punishment for forgetting to commit the fix the clang test.)

}

// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
uint32_t __mask = (uint32_t)__lane_mask;
return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
__gpu_num_lanes() - 1u)
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
<< 32ull) |
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
__gpu_num_lanes() - 1u));
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

// Returns true if the flat pointer points to CUDA 'shared' memory.
Expand Down
5 changes: 3 additions & 2 deletions libc/src/__support/GPU/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }

LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }

LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
return __gpu_shuffle_idx_u32(lane_mask, idx, x);
LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
uint32_t width = __gpu_num_lanes()) {
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}

[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
Expand Down
9 changes: 9 additions & 0 deletions libc/test/integration/src/__support/GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,12 @@ add_integration_test(
LOADER_ARGS
--threads 64
)

add_integration_test(
shuffle_test
SUITE libc-support-gpu-tests
SRCS
shuffle.cpp
LOADER_ARGS
--threads 64
)
33 changes: 33 additions & 0 deletions libc/test/integration/src/__support/GPU/shuffle.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//===-- Test for the shuffle operations on the GPU ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/CPP/bit.h"
#include "src/__support/GPU/utils.h"
#include "test/IntegrationTest/test.h"

using namespace LIBC_NAMESPACE;

// Test to make sure the shuffle instruction works by doing a simple broadcast.
// Each iteration reduces the width, so it will broadcast to a subset we check.
static void test_shuffle() {
uint64_t mask = gpu::get_lane_mask();
EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size());

uint32_t x = gpu::get_lane_id();
for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2)
EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width);
}

TEST_MAIN(int argc, char **argv, char **envp) {
if (gpu::get_thread_id() >= gpu::get_lane_size())
return 0;

test_shuffle();

return 0;
}