@@ -153,10 +153,10 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
153
153
} \
154
154
return __gpu_read_first_lane_##__suffix(__lane_mask, x); \
155
155
}
156
- __DO_LANE_REDUCE (uint32_t , u32 );
157
- __DO_LANE_REDUCE (uint64_t , u64 );
158
- __DO_LANE_REDUCE (float , f32 );
159
- __DO_LANE_REDUCE (double , f64 );
156
+ __DO_LANE_REDUCE (uint32_t , u32 ); // uint32_t __gpu_lane_reduce_u32(m, x)
157
+ __DO_LANE_REDUCE (uint64_t , u64 ); // uint64_t __gpu_lane_reduce_u64(m, x)
158
+ __DO_LANE_REDUCE (float , f32 ); // float __gpu_lane_reduce_f32(m, x)
159
+ __DO_LANE_REDUCE (double , f64 ); // double __gpu_lane_reduce_f64(m, x)
160
160
#undef __DO_LANE_REDUCE
161
161
162
162
// Gets the accumulator scan of the threads in the warp or wavefront.
@@ -173,10 +173,10 @@ __DO_LANE_REDUCE(double, f64);
173
173
} \
174
174
return x; \
175
175
}
176
- __DO_LANE_SCAN (uint32_t , uint32_t , u32 );
177
- __DO_LANE_SCAN (uint64_t , uint64_t , u64 );
178
- __DO_LANE_SCAN (float , uint32_t , f32 );
179
- __DO_LANE_SCAN (double , uint64_t , f64 );
176
+ __DO_LANE_SCAN (uint32_t , uint32_t , u32 ); // uint32_t __gpu_lane_scan_u32(m, x)
177
+ __DO_LANE_SCAN (uint64_t , uint64_t , u64 ); // uint64_t __gpu_lane_scan_u64(m, x)
178
+ __DO_LANE_SCAN (float , uint32_t , f32 ); // float __gpu_lane_scan_f32(m, x)
179
+ __DO_LANE_SCAN (double , uint64_t , f64 ); // double __gpu_lane_scan_f64(m, x)
180
180
#undef __DO_LANE_SCAN
181
181
182
182
_Pragma ("omp end declare variant" );
0 commit comments