@@ -116,6 +116,20 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
116
116
return __nvvm_shfl_sync_idx_i32 (__mask , __x , __id , __gpu_num_lanes () - 1 );
117
117
}
118
118
119
+ // Copies the value from the first active thread in the warp to the rest.
120
+ _DEFAULT_FN_ATTRS static __inline__ uint64_t
121
+ __gpu_read_first_lane_u64 (uint64_t __lane_mask , uint64_t __x ) {
122
+ uint32_t __hi = (uint32_t )(__x >> 32ull );
123
+ uint32_t __lo = (uint32_t )(__x & 0xFFFFFFFF );
124
+ uint32_t __mask = (uint32_t )__lane_mask ;
125
+ uint32_t __id = __builtin_ffs (__mask ) - 1 ;
126
+ return ((uint64_t )__nvvm_shfl_sync_idx_i32 (__mask , __hi , __id ,
127
+ __gpu_num_lanes () - 1 )
128
+ << 32ull ) |
129
+ ((uint64_t )__nvvm_shfl_sync_idx_i32 (__mask , __lo , __id ,
130
+ __gpu_num_lanes () - 1 ));
131
+ }
132
+
119
133
// Returns a bitmask of threads in the current warp for which \p x is true.
120
134
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot (uint64_t __lane_mask ,
121
135
bool __x ) {
@@ -142,6 +156,20 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
142
156
__nvvm_shfl_sync_idx_i32 (__mask , __x , __idx , __gpu_num_lanes () - 1u );
143
157
}
144
158
159
+ // Shuffles the the lanes inside the warp according to the given index.
160
+ _DEFAULT_FN_ATTRS static __inline__ uint64_t
161
+ __gpu_shuffle_idx_u64 (uint64_t __lane_mask , uint32_t __idx , uint64_t __x ) {
162
+ uint32_t __hi = (uint32_t )(__x >> 32ull );
163
+ uint32_t __lo = (uint32_t )(__x & 0xFFFFFFFF );
164
+ uint32_t __mask = (uint32_t )__lane_mask ;
165
+ uint64_t __bitmask = (__mask >> __idx ) & 1u ;
166
+ return - __bitmask & ((uint64_t )__nvvm_shfl_sync_idx_i32 (
167
+ __mask , __hi , __idx , __gpu_num_lanes () - 1u )
168
+ << 32ull ) |
169
+ ((uint64_t )__nvvm_shfl_sync_idx_i32 (__mask , __lo , __idx ,
170
+ __gpu_num_lanes () - 1u ));
171
+ }
172
+
145
173
// Terminates execution of the calling thread.
146
174
_DEFAULT_FN_ATTRS [[noreturn ]] static __inline__ void __gpu_exit (void ) {
147
175
__nvvm_exit ();
0 commit comments