Skip to content

Commit ffc10e8

Browse files
committed
Add floating point versions
1 parent f727e6a commit ffc10e8

File tree

1 file changed

+43
-15
lines changed

1 file changed

+43
-15
lines changed

clang/lib/Headers/gpuintrin.h

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -111,26 +111,54 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
111111
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
112112
}
113113

114+
// Gets the first floating point value from the active lanes.
115+
_DEFAULT_FN_ATTRS static __inline__ float
116+
__gpu_shuffle_idx_f32(uint64_t __lane_mask, float __x) {
117+
return __builtin_bit_cast(
118+
float,
119+
__gpu_shuffle_idx_u32(__lane_mask, __builtin_bit_cast(uint32_t, __x)));
120+
}
121+
122+
// Gets the first floating point value from the active lanes.
123+
_DEFAULT_FN_ATTRS static __inline__ double
124+
__gpu_shuffle_idx_f64(uint64_t __lane_mask, double __x) {
125+
return __builtin_bit_cast(
126+
double,
127+
__gpu_shuffle_idx_u64(__lane_mask, __builtin_bit_cast(uint64_t, __x)));
128+
}
129+
114130
// Gets the sum of all lanes inside the warp or wavefront.
115-
_DEFAULT_FN_ATTRS static __inline__ uint32_t
116-
__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
117-
for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
118-
uint32_t index = step + __gpu_lane_id();
119-
x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
131+
#define __DO_LANE_REDUCE(__type, __suffix) \
132+
_DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix( \
133+
uint64_t __lane_mask, __type x) { \
134+
for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) { \
135+
uint32_t index = step + __gpu_lane_id(); \
136+
x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x); \
137+
} \
138+
return __gpu_read_first_lane_##__suffix(__lane_mask, x); \
120139
}
121-
return __gpu_read_first_lane_u32(__lane_mask, x);
122-
}
140+
__DO_LANE_REDUCE(uint32_t, u32);
141+
__DO_LANE_REDUCE(uint64_t, u64);
142+
__DO_LANE_REDUCE(float, f32);
143+
__DO_LANE_REDUCE(double, f64);
144+
#undef __DO_LANE_REDUCE
123145

124146
// Gets the accumulator scan of the threads in the warp or wavefront.
125-
_DEFAULT_FN_ATTRS static __inline__ uint32_t
126-
__gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
127-
for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
128-
uint32_t index = __gpu_lane_id() - step;
129-
uint32_t bitmask = __gpu_lane_id() >= step;
130-
x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
147+
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
148+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
149+
uint64_t __lane_mask, uint32_t x) { \
150+
for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) { \
151+
uint32_t index = __gpu_lane_id() - step; \
152+
__bitmask_type bitmask = __gpu_lane_id() >= step; \
153+
x += -bitmask & __gpu_shuffle_idx_##__suffix(__lane_mask, index, x); \
154+
} \
155+
return x; \
131156
}
132-
return x;
133-
}
157+
__DO_LANE_SCAN(uint32_t, uint32_t, u32);
158+
__DO_LANE_SCAN(uint64_t, uint64_t, u64);
159+
__DO_LANE_SCAN(float, uint32_t, f32);
160+
__DO_LANE_SCAN(double, uint64_t, f64);
161+
#undef __DO_LANE_SCAN
134162

135163
_Pragma("omp end declare variant");
136164
_Pragma("omp end declare target");

0 commit comments

Comments
 (0)