Skip to content

Commit f9f0e83

Browse files
committed
Fix floating point scans
1 parent ffc10e8 commit f9f0e83

File tree

1 file changed

+26
-7
lines changed

1 file changed

+26
-7
lines changed

clang/lib/Headers/gpuintrin.h

Lines changed: 26 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -113,18 +113,34 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
113113

114114
// Gets the first floating point value from the active lanes.
115115
_DEFAULT_FN_ATTRS static __inline__ float
116-
__gpu_shuffle_idx_f32(uint64_t __lane_mask, float __x) {
116+
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
117117
return __builtin_bit_cast(
118-
float,
119-
__gpu_shuffle_idx_u32(__lane_mask, __builtin_bit_cast(uint32_t, __x)));
118+
float, __gpu_read_first_lane_u32(__lane_mask,
119+
__builtin_bit_cast(uint32_t, __x)));
120120
}
121121

122122
// Gets the first floating point value from the active lanes.
123123
_DEFAULT_FN_ATTRS static __inline__ double
124-
__gpu_shuffle_idx_f64(uint64_t __lane_mask, double __x) {
124+
__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
125125
return __builtin_bit_cast(
126-
double,
127-
__gpu_shuffle_idx_u64(__lane_mask, __builtin_bit_cast(uint64_t, __x)));
126+
double, __gpu_read_first_lane_u64(__lane_mask,
127+
__builtin_bit_cast(uint64_t, __x)));
128+
}
129+
130+
// Shuffles the floating point value across the lanes according to the given index.
131+
_DEFAULT_FN_ATTRS static __inline__ float
132+
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
133+
return __builtin_bit_cast(
134+
float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
135+
__builtin_bit_cast(uint32_t, __x)));
136+
}
137+
138+
// Shuffles the floating point value across the lanes according to the given index.
139+
_DEFAULT_FN_ATTRS static __inline__ double
140+
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
141+
return __builtin_bit_cast(
142+
double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
143+
__builtin_bit_cast(uint64_t, __x)));
128144
}
129145

130146
// Gets the sum of all lanes inside the warp or wavefront.
@@ -150,7 +166,10 @@ __DO_LANE_REDUCE(double, f64);
150166
for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) { \
151167
uint32_t index = __gpu_lane_id() - step; \
152168
__bitmask_type bitmask = __gpu_lane_id() >= step; \
153-
x += -bitmask & __gpu_shuffle_idx_##__suffix(__lane_mask, index, x); \
169+
x += __builtin_bit_cast( \
170+
__type, -bitmask & __builtin_bit_cast(__bitmask_type, \
171+
__gpu_shuffle_idx_##__suffix( \
172+
__lane_mask, index, x))); \
154173
} \
155174
return x; \
156175
}

0 commit comments

Comments
 (0)