Skip to content

Commit f727e6a

Browse files
committed
Support for u64
1 parent 1703f01 commit f727e6a

File tree

2 files changed

+46
-0
lines changed

2 files changed

+46
-0
lines changed

clang/lib/Headers/amdgpuintrin.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,15 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
115115
return __builtin_amdgcn_readfirstlane(__x);
116116
}
117117

118+
// Copies the value from the first active thread in the wavefront to the rest.
119+
_DEFAULT_FN_ATTRS __inline__ uint64_t
120+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121+
uint32_t __hi = (uint32_t)(__x >> 32ull);
122+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
123+
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
124+
((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
125+
}
126+
118127
// Returns a bitmask of threads in the current lane for which \p x is true.
119128
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
120129
bool __x) {
@@ -140,6 +149,15 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
140149
return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
141150
}
142151

152+
// Shuffles the the lanes inside the wavefront according to the given index.
153+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
154+
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
155+
uint32_t __hi = (uint32_t)(__x >> 32ull);
156+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
157+
return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
158+
((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
159+
}
160+
143161
// Terminates execution of the associated wavefront.
144162
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
145163
__builtin_amdgcn_endpgm();

clang/lib/Headers/nvptxintrin.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,20 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
116116
return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
117117
}
118118

119+
// Copies the value from the first active thread in the warp to the rest.
120+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
121+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
122+
uint32_t __hi = (uint32_t)(__x >> 32ull);
123+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
124+
uint32_t __mask = (uint32_t)__lane_mask;
125+
uint32_t __id = __builtin_ffs(__mask) - 1;
126+
return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
127+
__gpu_num_lanes() - 1)
128+
<< 32ull) |
129+
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
130+
__gpu_num_lanes() - 1));
131+
}
132+
119133
// Returns a bitmask of threads in the current lane for which \p x is true.
120134
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
121135
bool __x) {
@@ -142,6 +156,20 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
142156
__nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
143157
}
144158

159+
// Shuffles the the lanes inside the warp according to the given index.
160+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
161+
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
162+
uint32_t __hi = (uint32_t)(__x >> 32ull);
163+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
164+
uint32_t __mask = (uint32_t)__lane_mask;
165+
uint64_t __bitmask = (__mask >> __idx) & 1u;
166+
return -__bitmask & ((uint64_t)__nvvm_shfl_sync_idx_i32(
167+
__mask, __hi, __idx, __gpu_num_lanes() - 1u)
168+
<< 32ull) |
169+
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
170+
__gpu_num_lanes() - 1u));
171+
}
172+
145173
// Terminates execution of the calling thread.
146174
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
147175
__nvvm_exit();

0 commit comments

Comments
 (0)