-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[Clang] Add width handling for <gpuintrin.h> shuffle helper #125896
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) { | |
|
||
// Shuffles the lanes inside the warp according to the given index. | ||
_DEFAULT_FN_ATTRS static __inline__ uint32_t | ||
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) { | ||
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, | ||
uint32_t __width) { | ||
uint32_t __mask = (uint32_t)__lane_mask; | ||
return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u); | ||
return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, | ||
((__gpu_num_lanes() - __width) << 8u) | 0x1f); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUIC, the How exactly does Either I'm confused, or the code as written has a bug. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I took it from https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/__clang_cuda_intrinsics.h#L88 and it gave me the output I expected, so I assumed it was right. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm.. Looks like CUDA SDK implements shfl_sync_idx in their own headers the same way: OK, I'm officially confused now, but given that it's been implemented this way for about a decade now, I'm fine keeping it as is, until there's concrete evidence that it's broken. In practice it probably means that we can't (and don't) really use non-default values for width on NVIDIA GPUs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you know if this is okay so I can approve the backport? (Also below is my punishment for forgetting to commit the fix the clang test.) |
||
} | ||
|
||
// Shuffles the lanes inside the warp according to the given index. | ||
_DEFAULT_FN_ATTRS static __inline__ uint64_t | ||
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) { | ||
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, | ||
uint32_t __width) { | ||
uint32_t __hi = (uint32_t)(__x >> 32ull); | ||
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); | ||
uint32_t __mask = (uint32_t)__lane_mask; | ||
return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx, | ||
__gpu_num_lanes() - 1u) | ||
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) | ||
<< 32ull) | | ||
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx, | ||
__gpu_num_lanes() - 1u)); | ||
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width)); | ||
} | ||
|
||
// Returns true if the flat pointer points to CUDA 'shared' memory. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
//===-- Test for the shuffle operations on the GPU ------------------------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "src/__support/CPP/bit.h" | ||
#include "src/__support/GPU/utils.h" | ||
#include "test/IntegrationTest/test.h" | ||
|
||
using namespace LIBC_NAMESPACE; | ||
|
||
// Test to make sure the shuffle instruction works by doing a simple broadcast. | ||
// Each iteration reduces the width, so it will broadcast to a subset we check. | ||
// | ||
// Every lane contributes its own lane id and reads the value held at index 0 | ||
// for the given width. The expected result is the lane id rounded down to a | ||
// multiple of `width` (presumably the id of the first lane in this lane's | ||
// width-sized group -- confirm against the gpu::shuffle contract). | ||
static void test_shuffle() { | ||
// The number of set bits in the lane mask must equal the lane size before | ||
// the shuffle results below are checked. | ||
uint64_t mask = gpu::get_lane_mask(); | ||
EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size()); | ||
|
||
// Each lane shuffles its own id, so the value read back identifies which | ||
// lane supplied it. | ||
uint32_t x = gpu::get_lane_id(); | ||
// Halve the width every iteration, from the full lane size down to 1; the | ||
// loop stops once integer division drives the width to 0. | ||
for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2) | ||
EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width); | ||
} | ||
|
||
// Test entry point. Threads whose id is at or beyond the lane size return | ||
// immediately without running the test; the remaining threads run | ||
// test_shuffle(). NOTE(review): presumably this limits the check to a single | ||
// warp/wavefront's worth of threads -- confirm against the test harness. | ||
TEST_MAIN(int argc, char **argv, char **envp) { | ||
if (gpu::get_thread_id() >= gpu::get_lane_size()) | ||
return 0; | ||
|
||
test_shuffle(); | ||
|
||
return 0; | ||
} |
Uh oh!
There was an error while loading. Please reload this page.