@@ -148,34 +148,35 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_REDUCE(__type, __suffix) \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix( \
-      uint64_t __lane_mask, __type x) { \
-    for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) { \
-      uint32_t index = step + __gpu_lane_id(); \
-      x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x); \
+#define __DO_LANE_SUM(__type, __suffix) \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
+      uint64_t __lane_mask, __type __x) { \
+    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
+      uint32_t __index = __step + __gpu_lane_id(); \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
     } \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, x); \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
   }
-__DO_LANE_REDUCE(uint32_t, u32); // uint32_t __gpu_lane_reduce_u32(m, x)
-__DO_LANE_REDUCE(uint64_t, u64); // uint64_t __gpu_lane_reduce_u64(m, x)
-__DO_LANE_REDUCE(float, f32);    // float __gpu_lane_reduce_f32(m, x)
-__DO_LANE_REDUCE(double, f64);   // double __gpu_lane_reduce_f64(m, x)
-#undef __DO_LANE_REDUCE
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
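For readers following the rename, here is a host-side C sketch of the butterfly reduction these lane-sum helpers perform. It is only a model of the algorithm, not the header's implementation: the fixed NUM_LANES of 32, the lane_sum_model name, and the plain array standing in for per-lane registers are illustrative assumptions, and the __gpu_shuffle_idx_* / __gpu_read_first_lane_* calls are replaced by ordinary array reads (out-of-range shuffle indices are modeled with a simple wrap; the lanes that feed lane 0's result never index out of range).

// Host-side model of the butterfly sum behind __gpu_lane_sum_* (sketch only;
// NUM_LANES, lane_sum_model, and the "lane" array are illustrative).
#include <stdint.h>
#include <stdio.h>

#define NUM_LANES 32u /* assumed warp width; real targets use 32 or 64 */

static uint32_t lane_sum_model(uint32_t vals[NUM_LANES]) {
  uint32_t next[NUM_LANES];
  // Each round, every lane adds the value held by lane (lane_id + step) and
  // the stride halves; after log2(NUM_LANES) rounds lane 0 holds the total,
  // which the header then broadcasts via __gpu_read_first_lane_*.
  for (uint32_t step = NUM_LANES / 2; step > 0; step /= 2) {
    for (uint32_t lane = 0; lane < NUM_LANES; ++lane)
      next[lane] = vals[lane] + vals[(lane + step) % NUM_LANES];
    for (uint32_t lane = 0; lane < NUM_LANES; ++lane)
      vals[lane] = next[lane];
  }
  return vals[0]; // stands in for the first-lane broadcast
}

int main(void) {
  uint32_t vals[NUM_LANES];
  for (uint32_t i = 0; i < NUM_LANES; ++i)
    vals[i] = i + 1;                    // 1 + 2 + ... + 32
  printf("%u\n", lane_sum_model(vals)); // prints 528
  return 0;
}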
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
-      uint64_t __lane_mask, uint32_t x) { \
-    for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) { \
-      uint32_t index = __gpu_lane_id() - step; \
-      __bitmask_type bitmask = __gpu_lane_id() >= step; \
-      x += __builtin_bit_cast( \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type, \
-                                                __gpu_shuffle_idx_##__suffix( \
-                                                    __lane_mask, index, x))); \
+      uint64_t __lane_mask, uint32_t __x) { \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+      uint32_t __index = __gpu_lane_id() - __step; \
+      __bitmask_type bitmask = __gpu_lane_id() >= __step; \
+      __x += __builtin_bit_cast( \
+          __type, \
+          -bitmask & __builtin_bit_cast(__bitmask_type, \
+                                        __gpu_shuffle_idx_##__suffix( \
+                                            __lane_mask, __index, __x))); \
     } \
-    return x; \
+    return __x; \
   }
 __DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
 __DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
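Similarly, a rough host-side model of the inclusive prefix sum (a Hillis-Steele style scan) that __gpu_lane_scan_* computes; the NUM_LANES constant and lane_scan_model name are again assumptions made for illustration, and the explicit conditional stands in for the branchless -bitmask & __builtin_bit_cast(...) masking used in the header.

// Host-side model of the inclusive scan behind __gpu_lane_scan_* (sketch only;
// NUM_LANES and lane_scan_model are illustrative, not part of the header).
#include <stdint.h>
#include <stdio.h>

#define NUM_LANES 32u /* assumed warp width */

static void lane_scan_model(uint32_t vals[NUM_LANES]) {
  uint32_t next[NUM_LANES];
  // Each round, lane i adds the value from lane (i - step) when i >= step and
  // the stride doubles; the header expresses that predicate branchlessly as
  // -bitmask & __builtin_bit_cast(...), which this model writes as a ternary.
  for (uint32_t step = 1; step < NUM_LANES; step *= 2) {
    for (uint32_t lane = 0; lane < NUM_LANES; ++lane)
      next[lane] = vals[lane] + (lane >= step ? vals[lane - step] : 0u);
    for (uint32_t lane = 0; lane < NUM_LANES; ++lane)
      vals[lane] = next[lane];
  }
}

int main(void) {
  uint32_t vals[NUM_LANES];
  for (uint32_t i = 0; i < NUM_LANES; ++i)
    vals[i] = 1u; // scanning all-ones yields 1, 2, ..., 32
  lane_scan_model(vals);
  for (uint32_t i = 0; i < NUM_LANES; ++i)
    printf("%u ", vals[i]);
  putchar('\n');
  return 0;
}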