
Commit 6cc7ca0

[Clang] Fix cross-lane scan when given divergent lanes (llvm#127703)
Summary: The scan operation implemented here only works if the execution mask contains contiguous ones that can be used to propagate the result. There are two solutions: either enter 'whole-wave-mode' and forcibly turn the inactive lanes back on, or perform the scan serially. This implementation does the latter because it is more portable, but it first checks whether the parallel fast path is applicable. Needs to be backported for correct behavior and because it fixes a failing libc test.
1 parent 0127f16 commit 6cc7ca0
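
The fast-path test added here is a bit trick: shift the execution mask right by its trailing zero count, and the active lanes are contiguous exactly when the shifted value has the form 2^k - 1, that is, when __first & (__first + 1) == 0. Below is a minimal host-side sketch of that classification (my illustration, not code from the commit; the helper name is_divergent is hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Mirrors the __first & (__first + 1) check from the patch: returns true
// when the set bits of lane_mask are not one contiguous run.
static bool is_divergent(uint64_t lane_mask) {
  // Shift out trailing zeros so the lowest active lane sits at bit 0.
  uint64_t first = lane_mask >> __builtin_ctzll(lane_mask);
  // A contiguous run of ones is one less than a power of two, so
  // first & (first + 1) == 0 iff the mask has no holes.
  return (first & (first + 1)) != 0;
}

int main(void) {
  printf("%d\n", is_divergent(0x1C)); // 0b11100: contiguous -> 0
  printf("%d\n", is_divergent(0x16)); // 0b10110: has a hole -> 1
  return 0;
}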

File tree: 3 files changed (+102, -26 lines)

clang/lib/Headers/gpuintrin.h

Lines changed: 49 additions & 25 deletions
@@ -150,35 +150,33 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                                     __builtin_bit_cast(uint64_t, __x), __width));
 }
 
-// Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_SUM(__type, __suffix)                                        \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
-      uint64_t __lane_mask, __type __x) {                                      \
-    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
-      uint32_t __index = __step + __gpu_lane_id();                             \
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,           \
-                                          __gpu_num_lanes());                  \
-    }                                                                          \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
-  }
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
-#undef __DO_LANE_SUM
-
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
       uint64_t __lane_mask, uint32_t __x) {                                    \
-    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
-      uint32_t __index = __gpu_lane_id() - __step;                             \
-      __bitmask_type bitmask = __gpu_lane_id() >= __step;                      \
-      __x += __builtin_bit_cast(                                               \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
-                                                __gpu_shuffle_idx_##__suffix(  \
-                                                    __lane_mask, __index, __x, \
-                                                    __gpu_num_lanes())));      \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);           \
+    bool __divergent = __gpu_read_first_lane_##__suffix(                      \
+        __lane_mask, __first & (__first + 1));                                \
+    if (__divergent) {                                                        \
+      __type __accum = 0;                                                     \
+      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {     \
+        __type __index = __builtin_ctzll(__mask);                             \
+        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,\
+                                                    __gpu_num_lanes());       \
+        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;             \
+        __accum += __tmp;                                                     \
+      }                                                                       \
+    } else {                                                                  \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {    \
+        uint32_t __index = __gpu_lane_id() - __step;                          \
+        __bitmask_type bitmask = __gpu_lane_id() >= __step;                   \
+        __x += __builtin_bit_cast(                                            \
+            __type,                                                           \
+            -bitmask & __builtin_bit_cast(__bitmask_type,                     \
+                                          __gpu_shuffle_idx_##__suffix(       \
+                                              __lane_mask, __index, __x,      \
+                                              __gpu_num_lanes())));           \
+      }                                                                       \
     }                                                                          \
     return __x;                                                                \
   }
@@ -188,6 +186,32 @@ __DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
 __DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
 #undef __DO_LANE_SCAN
 
+// Gets the sum of all lanes inside the warp or wavefront.
+#define __DO_LANE_SUM(__type, __suffix)                                       \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(       \
+      uint64_t __lane_mask, __type __x) {                                     \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);           \
+    bool __divergent = __gpu_read_first_lane_##__suffix(                      \
+        __lane_mask, __first & (__first + 1));                                \
+    if (__divergent) {                                                        \
+      return __gpu_shuffle_idx_##__suffix(                                    \
+          __lane_mask, 63 - __builtin_clzll(__lane_mask),                     \
+          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());   \
+    } else {                                                                  \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {    \
+        uint32_t __index = __step + __gpu_lane_id();                          \
+        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,        \
+                                            __gpu_num_lanes());               \
+      }                                                                       \
+      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);              \
+    }                                                                         \
+  }
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");

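The serial fallback in __gpu_lane_scan_* walks the set bits of the lane mask with __mask &= __mask - 1, which clears the lowest set bit each iteration, so active lanes are visited in ascending order; each visited lane's value is broadcast with a shuffle, that lane receives the running total plus its own value, and the total accumulates for the lanes after it. The following host-side simulation of that loop is my sketch of the same idea, not code from the commit (the array x[] stands in for the lanes' private copies of __x):

#include <stdint.h>
#include <stdio.h>

// Simulates the serial scan path: only lanes whose bit is set in
// lane_mask participate; x[i] plays the role of lane i's private value.
static void serial_scan(uint64_t lane_mask, uint32_t x[64]) {
  uint32_t accum = 0;
  for (uint64_t mask = lane_mask; mask; mask &= mask - 1) {
    uint32_t index = __builtin_ctzll(mask); // lowest remaining active lane
    uint32_t tmp = x[index];                // what the shuffle would read
    x[index] = accum + tmp;                 // inclusive prefix for this lane
    accum += tmp;                           // running total for later lanes
  }
}

int main(void) {
  uint32_t x[64] = {[1] = 1, [2] = 2, [5] = 5};
  serial_scan(0x26, x); // active lanes 1, 2, and 5 (0b100110)
  printf("%u %u %u\n", x[1], x[2], x[5]); // prints: 1 3 8
  return 0;
}
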
clang/lib/Headers/nvptxintrin.h

Lines changed: 4 additions & 1 deletion
@@ -155,8 +155,11 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                       uint32_t __width) {
+  // Mask out inactive lanes to match AMDGPU behavior.
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+  bool __bitmask = (1ull << __idx) & __lane_mask;
+  return -__bitmask &
+         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
                                   ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 }

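Per the new comment, NVPTX's shfl.sync does not zero the result when the requested source lane is inactive, so the wrapper now does it by hand to match AMDGPU behavior: negating the bool yields all-ones when the lane is active and all-zeros when it is not. A standalone illustration of that zeroing trick (my sketch, not code from the commit; mask_result is a hypothetical helper):

#include <stdint.h>
#include <stdio.h>

// Zeroes 'shfl' unless lane 'idx' is active in lane_mask, mirroring the
// -__bitmask & ... expression in the patch above.
static uint32_t mask_result(uint64_t lane_mask, uint32_t idx, uint32_t shfl) {
  uint32_t bitmask = ((1ull << idx) & lane_mask) != 0; // 1 if active, else 0
  return -bitmask & shfl; // -1 keeps all bits, -0 clears them
}

int main(void) {
  printf("%u\n", mask_result(0x0A, 1, 42)); // lane 1 active   -> 42
  printf("%u\n", mask_result(0x0A, 2, 42)); // lane 2 inactive -> 0
  return 0;
}
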
libc/test/integration/src/__support/GPU/scan_reduce.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,59 @@ static void test_scan() {
5353
EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
5454
}
5555

56+
static uint32_t random(uint64_t *rand_next) {
57+
uint64_t x = *rand_next;
58+
x ^= x >> 12;
59+
x ^= x << 25;
60+
x ^= x >> 27;
61+
*rand_next = x;
62+
return static_cast<uint32_t>((x * 0x2545F4914F6CDD1Dul) >> 32);
63+
}
64+
65+
// Scan operations can break down under thread divergence, make sure that the
66+
// function works under some random divergence. We do this by trivially
67+
// implementing a scan with shared scratch memory and then comparing the
68+
// results.
69+
static void test_scan_divergent() {
70+
static uint32_t input[64] = {0};
71+
static uint32_t result[64] = {0};
72+
uint64_t state = gpu::processor_clock() + __gpu_lane_id();
73+
74+
for (int i = 0; i < 64; ++i) {
75+
uint64_t lanemask = gpu::get_lane_mask();
76+
if (random(&state) & (1ull << gpu::get_lane_id())) {
77+
uint64_t divergent = gpu::get_lane_mask();
78+
uint32_t value = random(&state) % 256;
79+
input[gpu::get_lane_id()] = value;
80+
81+
if (gpu::is_first_lane(divergent)) {
82+
uint32_t accumulator = 0;
83+
for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) {
84+
uint32_t tmp = input[lane];
85+
result[lane] = tmp + accumulator;
86+
accumulator += tmp;
87+
}
88+
}
89+
gpu::sync_lane(divergent);
90+
91+
uint32_t scan = gpu::scan(divergent, value);
92+
EXPECT_EQ(scan, result[gpu::get_lane_id()]);
93+
}
94+
if (gpu::is_first_lane(lanemask))
95+
__builtin_memset(input, 0, sizeof(input));
96+
gpu::sync_lane(lanemask);
97+
}
98+
}
99+
56100
TEST_MAIN(int argc, char **argv, char **envp) {
101+
if (gpu::get_thread_id() >= gpu::get_lane_size())
102+
return 0;
103+
57104
test_reduce();
58105

59106
test_scan();
60107

108+
test_scan_divergent();
109+
61110
return 0;
62111
}
