@@ -113,18 +113,34 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
 
 // Gets the first floating point value from the active lanes.
 _DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, float __x) {
+__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
   return __builtin_bit_cast(
-      float,
-      __gpu_shuffle_idx_u32(__lane_mask, __builtin_bit_cast(uint32_t, __x)));
+      float, __gpu_read_first_lane_u32(__lane_mask,
+                                       __builtin_bit_cast(uint32_t, __x)));
 }
 
 // Gets the first floating point value from the active lanes.
 _DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, double __x) {
+__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
   return __builtin_bit_cast(
-      double,
-      __gpu_shuffle_idx_u64(__lane_mask, __builtin_bit_cast(uint64_t, __x)));
+      double, __gpu_read_first_lane_u64(__lane_mask,
+                                        __builtin_bit_cast(uint64_t, __x)));
+}
+
+// Shuffles the 32-bit floating point value according to the given lane index.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+  return __builtin_bit_cast(
+      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
+                                   __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Shuffles the 64-bit floating point value according to the given lane index.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+  return __builtin_bit_cast(
+      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                                    __builtin_bit_cast(uint64_t, __x)));
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
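
For illustration only, not part of the patch: a minimal sketch of how a kernel might pair the renamed broadcast with the new index-based shuffle. It assumes Clang with <gpuintrin.h> on the include path and a GPU target; __gpu_lane_mask() (the active-lane mask) and the helper name are assumptions, not taken from this diff.

#include <gpuintrin.h>

// Hypothetical helper: each lane reads __x from the next lane (modulo the
// warp size) and scales it by the first active lane's value.
static inline float __rotate_and_scale(float __x) {
  uint64_t __mask = __gpu_lane_mask();                   /* assumed: mask of active lanes */
  float __scale = __gpu_read_first_lane_f32(__mask, __x); /* broadcast from first active lane */
  uint32_t __src = (__gpu_lane_id() + 1) % __gpu_num_lanes(); /* index of the next lane */
  return __scale * __gpu_shuffle_idx_f32(__mask, __src, __x); /* read that lane's __x */
}
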
@@ -150,7 +166,10 @@ __DO_LANE_REDUCE(double, f64);
   for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {             \
     uint32_t index = __gpu_lane_id() - step;                                  \
     __bitmask_type bitmask = __gpu_lane_id() >= step;                         \
-    x += -bitmask & __gpu_shuffle_idx_##__suffix(__lane_mask, index, x);      \
+    x += __builtin_bit_cast(                                                  \
+        __type, -bitmask & __builtin_bit_cast(__bitmask_type,                 \
+                                              __gpu_shuffle_idx_##__suffix(   \
+                                                  __lane_mask, index, x)));   \
   }                                                                           \
   return x;                                                                   \
 }
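
For context, not part of the patch: the -bitmask & ... form predicates the shuffled operand without a branch, and because a bitwise AND is not defined for floating point operands, the added __builtin_bit_cast round trip reinterprets the shuffled value as __bitmask_type before masking and back to __type afterwards. Below is a host-side sketch of the same trick, assuming Clang (which accepts __builtin_bit_cast in C); the helper name is made up.

#include <stdint.h>

// Adds `incoming` to `acc` only when `active` is nonzero, without branching:
// the float is reinterpreted as bits, masked with 0 or ~0u, and reinterpreted
// back (an all-zero bit pattern is 0.0f).
static inline float __masked_add_f32(float acc, float incoming, int active) {
  uint32_t bitmask = (uint32_t)(active != 0);
  uint32_t bits = __builtin_bit_cast(uint32_t, incoming);
  return acc + __builtin_bit_cast(float, -bitmask & bits);
}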