@@ -111,26 +111,56 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
+// Shuffles the 32-bit value from the source lane.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+  return __builtin_bit_cast(
+      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
+                                   __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Shuffles the 64-bit value from the source lane.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+  return __builtin_bit_cast(
+      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                                    __builtin_bit_cast(uint64_t, __x)));
+}
+
 // Gets the sum of all lanes inside the warp or wavefront.
-_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
-  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
-    uint32_t index = step + __gpu_lane_id();
-    x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
+#define __DO_LANE_REDUCE(__type, __suffix)                                     \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix(    \
+      uint64_t __lane_mask, __type x) {                                       \
+    for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {        \
+      uint32_t index = step + __gpu_lane_id();                                \
+      x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x);               \
+    }                                                                          \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, x);                  \
   }
-  return __gpu_read_first_lane_u32(__lane_mask, x);
-}
+__DO_LANE_REDUCE(uint32_t, u32);
+__DO_LANE_REDUCE(uint64_t, u64);
+__DO_LANE_REDUCE(float, f32);
+__DO_LANE_REDUCE(double, f64);
+#undef __DO_LANE_REDUCE
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
-_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
-    uint32_t index = __gpu_lane_id() - step;
-    uint32_t bitmask = __gpu_lane_id() >= step;
-    x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
+#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix(      \
+      uint64_t __lane_mask, __type x) {                                       \
+    for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {            \
+      uint32_t index = __gpu_lane_id() - step;                                \
+      __bitmask_type bitmask = __gpu_lane_id() >= step;                       \
+      x += __builtin_bit_cast(                                                \
+          __type, -bitmask & __builtin_bit_cast(__bitmask_type,               \
+                      __gpu_shuffle_idx_##__suffix(__lane_mask, index, x)));  \
+    }                                                                          \
+    return x;                                                                  \
   }
-  return x;
-}
+__DO_LANE_SCAN(uint32_t, uint32_t, u32);
+__DO_LANE_SCAN(uint64_t, uint64_t, u64);
+__DO_LANE_SCAN(float, uint32_t, f32);
+__DO_LANE_SCAN(double, uint64_t, f64);
+#undef __DO_LANE_SCAN
 
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");