[Clang] Minor fixes to 'gpuintrin.h' header

jhuber6 · jhuber6 · commit 17d1523207c6 · 2025-01-28T12:07:02.000-06:00
Summary:
The bitmask applied in the NVPTX shuffle helpers gives different results from
the AMDGPU implementation, so remove it. Also fix some comments and casts.
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
@@ -158,16 +158,16 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
          ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
 }
 
-// Returns true if the flat pointer points to CUDA 'shared' memory.
+// Returns true if the flat pointer points to AMDGPU 'shared' memory.
 _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
-  return __builtin_amdgcn_is_shared((void __attribute__((address_space(0))) *)((
+  return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
       void [[clang::opencl_generic]] *)ptr));
 }
 
-// Returns true if the flat pointer points to CUDA 'local' memory.
+// Returns true if the flat pointer points to AMDGPU 'private' memory.
 _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
-  return __builtin_amdgcn_is_private((void __attribute__((
-      address_space(0))) *)((void [[clang::opencl_generic]] *)ptr));
+  return __builtin_amdgcn_is_private((void [[clang::address_space(0)]] *)((
+      void [[clang::opencl_generic]] *)ptr));
 }
 
 // Terminates execution of the associated wavefront.
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
@@ -151,9 +151,7 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
-  uint32_t __bitmask = (__mask >> __idx) & 1u;
-  return -__bitmask &
-         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
 }
 
 // Shuffles the the lanes inside the warp according to the given index.
@@ -162,10 +160,9 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
   uint32_t __mask = (uint32_t)__lane_mask;
-  uint64_t __bitmask = (__mask >> __idx) & 1u;
-  return -__bitmask & ((uint64_t)__nvvm_shfl_sync_idx_i32(
-                           __mask, __hi, __idx, __gpu_num_lanes() - 1u)
-                       << 32ull) |
+  return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
+                                             __gpu_num_lanes() - 1u)
+          << 32ull) |
          ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
                                              __gpu_num_lanes() - 1u));
 }

Original file line number	Diff line number	Diff line change
`@@ -158,16 +158,16 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {`
`158`	`158`	`((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));`
`159`	`159`	`}`
`160`	`160`
`161`		`-// Returns true if the flat pointer points to CUDA 'shared' memory.`
	`161`	`+// Returns true if the flat pointer points to AMDGPU 'shared' memory.`
`162`	`162`	`_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {`
`163`		`- return __builtin_amdgcn_is_shared((void __attribute__((address_space(0))) *)((`
	`163`	`+ return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((`
`164`	`164`	`void [[clang::opencl_generic]] *)ptr));`
`165`	`165`	`}`
`166`	`166`
`167`		`-// Returns true if the flat pointer points to CUDA 'local' memory.`
	`167`	`+// Returns true if the flat pointer points to AMDGPU 'private' memory.`
`168`	`168`	`_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {`
`169`		`- return __builtin_amdgcn_is_private((void __attribute__((`
`170`		`- address_space(0))) *)((void [[clang::opencl_generic]] *)ptr));`
	`169`	`+ return __builtin_amdgcn_is_private((void [[clang::address_space(0)]] *)((`
	`170`	`+ void [[clang::opencl_generic]] *)ptr));`
`171`	`171`	`}`
`172`	`172`
`173`	`173`	`// Terminates execution of the associated wavefront.`