[Clang] Add width handling for <gpuintrin.h> shuffle helper #125896
Conversation
Summary: The CUDA implementation has long supported the `width` argument on its shuffle instructions, which makes it more difficult to replace those uses with this helper. This patch implements that correctly for AMDGPU and NVPTX so it's equivalent to `__shfl_sync` in CUDA. This will ease porting.
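For context, here is a minimal usage sketch of the extended helper; the function name and the group size of 8 are illustrative assumptions, not part of the patch:

```c
#include <gpuintrin.h>

// Hypothetical device-side helper: broadcast the value held by the first
// lane of each 8-lane group to every lane in that group, the moral
// equivalent of CUDA's __shfl_sync(mask, x, 0, /*width=*/8).
static inline uint32_t __broadcast_by_eights(uint32_t __x) {
  // __idx is relative to the lane's group, so 0 selects the group leader.
  return __gpu_shuffle_idx_u32(__gpu_lane_mask(), /*__idx=*/0, __x,
                               /*__width=*/8);
}
```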
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-libc
Author: Joseph Huber (jhuber6)
Changes — Summary:
Full diff: https://github.com/llvm/llvm-project/pull/125896.diff
4 Files Affected:
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 038605605462f8..9dad99ffe9439a 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
- return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
+ uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+ return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
}
// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+ uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
- return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+ return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
+ << 32ull) |
+ ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
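To see what the new `__lane` computation does: `__gpu_lane_id() & ~(__width - 1)` rounds the caller's lane id down to the start of its `__width`-sized group, and `__idx` is then an offset within that group. A stand-alone model of the math (my own illustration, assuming a power-of-two `__width` as CUDA's `__shfl_sync` requires):

```c
#include <stdint.h>
#include <stdio.h>

// Host-side model of the source-lane computation in the hunk above.
static uint32_t source_lane(uint32_t lane_id, uint32_t idx, uint32_t width) {
  return idx + (lane_id & ~(width - 1)); // group base + offset within group
}

int main(void) {
  // Lane 13 with width 8 belongs to the group starting at lane 8
  // (13 & ~7 == 8), so idx 2 resolves to absolute lane 10.
  printf("%u\n", source_lane(13, 2, 8)); // prints 10
  return 0;
}
```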
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4c463c333308fc..11c87e85cd4975 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
+ uint32_t __width) {
return __builtin_bit_cast(
float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
- __builtin_bit_cast(uint32_t, __x)));
+ __builtin_bit_cast(uint32_t, __x), __width));
}
// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
+ uint32_t __width) {
return __builtin_bit_cast(
- double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
- __builtin_bit_cast(uint64_t, __x)));
+ double,
+ __gpu_shuffle_idx_u64(__lane_mask, __idx,
+ __builtin_bit_cast(uint64_t, __x), __width));
}
// Gets the sum of all lanes inside the warp or wavefront.
@@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
uint64_t __lane_mask, __type __x) { \
for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
uint32_t __index = __step + __gpu_lane_id(); \
- __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
+ __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+ __gpu_num_lanes()); \
} \
return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
}
@@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
uint32_t __index = __gpu_lane_id() - __step; \
__bitmask_type bitmask = __gpu_lane_id() >= __step; \
__x += __builtin_bit_cast( \
- __type, \
- -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x))); \
+ __type, -bitmask & __builtin_bit_cast(__bitmask_type, \
+ __gpu_shuffle_idx_##__suffix( \
+ __lane_mask, __index, __x, \
+ __gpu_num_lanes()))); \
} \
return __x; \
}
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fb2864eab6a09d..40fa2edebe975c 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
uint32_t __mask = (uint32_t)__lane_mask;
- return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+ return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+ ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
}
// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+ uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
uint32_t __mask = (uint32_t)__lane_mask;
- return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
- __gpu_num_lanes() - 1u)
+ return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
<< 32ull) |
- ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
- __gpu_num_lanes() - 1u));
+ ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
// Returns true if the flat pointer points to CUDA 'shared' memory.
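For reference, the last operand of `__nvvm_shfl_sync_idx_i32` is the PTX `shfl.sync` control word: bits 0-4 hold a clamp value and bits 8-12 hold a segment mask that logically splits the warp. A stand-alone sketch of the encoding the header now builds (hypothetical helper, assuming a 32-lane warp):

```c
#include <stdint.h>

// Hypothetical model of the control word built in the hunk above:
// clamp = 0x1f keeps the source lane in [0, 31]; (warp_size - width) in
// bits 8..12 marks the lane-id bits that stay fixed within a sub-group.
static inline uint32_t shfl_idx_control(uint32_t warp_size, uint32_t width) {
  return ((warp_size - width) << 8) | 0x1f;
}
```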
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index e138c84c0cb22d..323c003f1ff074 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
-LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
- return __gpu_shuffle_idx_u32(lane_mask, idx, x);
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
+ uint32_t width = __gpu_num_lanes()) {
+ return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}
[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
@llvm/pr-subscribers-backend-x86
Author: Joseph Huber (jhuber6)
test?
I could add one in
Alternatively I could show the codegen, but I don't think that really shows more than the source already does.
Also it will be tested in OpenMP once this lands and I update OpenMP to use it.
libc seems to be preferable here because it is changed anyway. |
The test works on AMD and NVPTX. |
I think the libc precommit is broken. |
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/144/builds/17457
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/140/builds/16361
  uint32_t __mask = (uint32_t)__lane_mask;
- return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+ return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+                                 ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
IIUIC, the `0x1f` replaces the original `__gpu_num_lanes() - 1u` and assumes that the warp width will never change. The bits 8-12 contain the "mask for logically splitting warps": https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync
How exactly does `(__gpu_num_lanes() - __width)` create a mask? AFAICT it will only work when `__width == __gpu_num_lanes()` and the value is 0, or if `width == 1`, which does make a mask for the `__gpu_num_lanes()`, but I don't think that's the intent.
Was it supposed to be `((__gpu_num_lanes() - __width) - 1) << 8u`? But that would not be right either, as then with the default `__width == __gpu_num_lanes()` we'd end up with `(0-1) << 8` and have all upper bits set.
Either I'm confused, or the code as written has a bug.
I took it from https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/__clang_cuda_intrinsics.h#L88 and it gave me the output I expected, so I assumed it was right.
Hmm.. Looks like CUDA SDK implements shfl_sync_idx in their own headers the same way:
OK, I'm officially confused now, but given that it's been implemented this way for about a decade now, I'm fine keeping it as is, until there's concrete evidence that it's broken.
In practice it probably means that we can't (and don't) really use non-default values for width on NVIDIA GPUs.
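One observation that may resolve the confusion (my own check, not from the thread): for the power-of-two widths `__shfl_sync` supports, `warpSize - width` is numerically equal to the 5-bit complement mask `~(width - 1) & 0x1f`, which is exactly the segment mask the PTX documentation describes, so the encoding is consistent for those widths:

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
  // For every power-of-two width a 32-lane warp can be split into,
  // 32 - width equals the 5-bit mask of the lane-id bits that select
  // a lane's sub-group, matching the "segmask" in bits 8..12.
  for (uint32_t width = 1; width <= 32; width <<= 1)
    assert(32u - width == (~(width - 1u) & 0x1fu));
  return 0;
}
```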
Do you know if this is okay so I can approve the backport? (Also below is my punishment for forgetting to commit the fix to the clang test.)
/pull-request #125912
Fix came in an hour ago; it's just the backlog coming in.
LLVM Buildbot has detected new failures on the following builders; full details are available at each link:
https://lab.llvm.org/buildbot/#/builders/125/builds/5430
https://lab.llvm.org/buildbot/#/builders/193/builds/5476
https://lab.llvm.org/buildbot/#/builders/23/builds/7317
https://lab.llvm.org/buildbot/#/builders/35/builds/7012
https://lab.llvm.org/buildbot/#/builders/39/builds/4349
https://lab.llvm.org/buildbot/#/builders/63/builds/3896
https://lab.llvm.org/buildbot/#/builders/145/builds/4947
https://lab.llvm.org/buildbot/#/builders/110/builds/3791
https://lab.llvm.org/buildbot/#/builders/169/builds/8163
https://lab.llvm.org/buildbot/#/builders/42/builds/3120
https://lab.llvm.org/buildbot/#/builders/13/builds/5131
https://lab.llvm.org/buildbot/#/builders/56/builds/18007
https://lab.llvm.org/buildbot/#/builders/60/builds/18912
https://lab.llvm.org/buildbot/#/builders/127/builds/2262
https://lab.llvm.org/buildbot/#/builders/136/builds/2668
https://lab.llvm.org/buildbot/#/builders/143/builds/5216
https://lab.llvm.org/buildbot/#/builders/17/builds/5607
https://lab.llvm.org/buildbot/#/builders/198/builds/1795
https://lab.llvm.org/buildbot/#/builders/64/builds/2185
https://lab.llvm.org/buildbot/#/builders/153/builds/22031
https://lab.llvm.org/buildbot/#/builders/161/builds/4566
Summary: The CUDA implementation has long supported the `width` argument on its shuffle instructions, which makes it more difficult to replace those uses with this helper. This patch implements that correctly for AMDGPU and NVPTX so it's equivalent to `__shfl_sync` in CUDA. This will ease porting. Fortunately these get optimized out correctly when passing in known widths. (cherry picked from commit 2d8106c)
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/98/builds/1124