Skip to content

Commit 17d1523

Browse files
committed
[Clang] Minor fixes to 'gpuintrin.h' header
Summary: The bitmask gives different results from the AMDGPU implementation, so it's not needed. Also fix some comments and casts.
1 parent bb95335 commit 17d1523

File tree

2 files changed

+9
-12
lines changed

2 files changed

+9
-12
lines changed

clang/lib/Headers/amdgpuintrin.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -158,16 +158,16 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
158158
((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
159159
}
160160

161-
// Returns true if the flat pointer points to CUDA 'shared' memory.
161+
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
162162
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
163-
return __builtin_amdgcn_is_shared((void __attribute__((address_space(0))) *)((
163+
return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
164164
void [[clang::opencl_generic]] *)ptr));
165165
}
166166

167-
// Returns true if the flat pointer points to CUDA 'local' memory.
167+
// Returns true if the flat pointer points to AMDGPU 'private' memory.
168168
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
169-
return __builtin_amdgcn_is_private((void __attribute__((
170-
address_space(0))) *)((void [[clang::opencl_generic]] *)ptr));
169+
return __builtin_amdgcn_is_private((void [[clang::address_space(0)]] *)((
170+
void [[clang::opencl_generic]] *)ptr));
171171
}
172172

173173
// Terminates execution of the associated wavefront.

clang/lib/Headers/nvptxintrin.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,7 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
151151
_DEFAULT_FN_ATTRS static __inline__ uint32_t
152152
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
153153
uint32_t __mask = (uint32_t)__lane_mask;
154-
uint32_t __bitmask = (__mask >> __idx) & 1u;
155-
return -__bitmask &
156-
__nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
154+
return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
157155
}
158156

159157
// Shuffles the lanes inside the warp according to the given index.
@@ -162,10 +160,9 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
162160
uint32_t __hi = (uint32_t)(__x >> 32ull);
163161
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
164162
uint32_t __mask = (uint32_t)__lane_mask;
165-
uint64_t __bitmask = (__mask >> __idx) & 1u;
166-
return -__bitmask & ((uint64_t)__nvvm_shfl_sync_idx_i32(
167-
__mask, __hi, __idx, __gpu_num_lanes() - 1u)
168-
<< 32ull) |
163+
return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
164+
__gpu_num_lanes() - 1u)
165+
<< 32ull) |
169166
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
170167
__gpu_num_lanes() - 1u));
171168
}

0 commit comments

Comments
 (0)