Skip to content

Commit 05d6734

Browse files
authored
[Clang] Pass correct lane mask for match helpers (#138693)
Summary: Once the threads that are already done have been masked off, use the ballot result as the lane mask passed to the first-lane read, instead of the original lane mask. This isn't an issue on AMDGPU, but it could cause problems on post-Volta architectures, because the original mask claims that threads are active when they aren't.
1 parent 8307d45 commit 05d6734

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

clang/lib/Headers/gpuintrin.h

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -264,9 +264,10 @@ __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
264264
uint64_t __match_mask = 0;
265265

266266
bool __done = 0;
267-
while (__gpu_ballot(__lane_mask, !__done)) {
267+
for (uint64_t __active_mask = __lane_mask; __active_mask;
268+
__active_mask = __gpu_ballot(__lane_mask, !__done)) {
268269
if (!__done) {
269-
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
270+
uint32_t __first = __gpu_read_first_lane_u32(__active_mask, __x);
270271
if (__first == __x) {
271272
__match_mask = __gpu_lane_mask();
272273
__done = 1;
@@ -283,9 +284,10 @@ __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
283284
uint64_t __match_mask = 0;
284285

285286
bool __done = 0;
286-
while (__gpu_ballot(__lane_mask, !__done)) {
287+
for (uint64_t __active_mask = __lane_mask; __active_mask;
288+
__active_mask = __gpu_ballot(__lane_mask, !__done)) {
287289
if (!__done) {
288-
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
290+
uint64_t __first = __gpu_read_first_lane_u64(__active_mask, __x);
289291
if (__first == __x) {
290292
__match_mask = __gpu_lane_mask();
291293
__done = 1;

0 commit comments

Comments
 (0)