Skip to content

Commit d48b335

Browse files
committed
Add more functions
1 parent 2386733 commit d48b335

File tree

4 files changed

+56
-22
lines changed

4 files changed

+56
-22
lines changed

clang/lib/Headers/amdgpuintrin.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,11 +158,28 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
158158
((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
159159
}
160160

161+
// Returns true if the flat pointer points to AMDGPU 'shared' memory (CUDA 'shared').
162+
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
163+
return __builtin_amdgcn_is_shared(
164+
(void __attribute__((address_space(0))) *)ptr);
165+
}
166+
167+
// Returns true if the flat pointer points to AMDGPU 'private' memory (CUDA 'local').
168+
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
169+
return __builtin_amdgcn_is_private(
170+
(void __attribute__((address_space(0))) *)ptr);
171+
}
172+
161173
// Terminates execution of the associated wavefront.
162174
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
163175
__builtin_amdgcn_endpgm();
164176
}
165177

178+
// Suspend the thread briefly to assist the scheduler during busy loops.
179+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
180+
__builtin_amdgcn_s_sleep(2);
181+
}
182+
166183
_Pragma("omp end declare variant");
167184
_Pragma("omp end declare target");
168185

clang/lib/Headers/gpuintrin.h

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -148,34 +148,35 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
148148
}
149149

150150
// Gets the sum of all lanes inside the warp or wavefront.
151-
#define __DO_LANE_REDUCE(__type, __suffix) \
152-
_DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix( \
153-
uint64_t __lane_mask, __type x) { \
154-
for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) { \
155-
uint32_t index = step + __gpu_lane_id(); \
156-
x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x); \
151+
#define __DO_LANE_SUM(__type, __suffix) \
152+
_DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
153+
uint64_t __lane_mask, __type __x) { \
154+
for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
155+
uint32_t __index = __step + __gpu_lane_id(); \
156+
__x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
157157
} \
158-
return __gpu_read_first_lane_##__suffix(__lane_mask, x); \
158+
return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
159159
}
160-
__DO_LANE_REDUCE(uint32_t, u32); // uint32_t __gpu_lane_reduce_u32(m, x)
161-
__DO_LANE_REDUCE(uint64_t, u64); // uint64_t __gpu_lane_reduce_u64(m, x)
162-
__DO_LANE_REDUCE(float, f32); // float __gpu_lane_reduce_f32(m, x)
163-
__DO_LANE_REDUCE(double, f64); // double __gpu_lane_reduce_f64(m, x)
164-
#undef __DO_LANE_REDUCE
160+
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
161+
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
162+
__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
163+
__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
164+
#undef __DO_LANE_SUM
165165

166166
// Gets the inclusive prefix sum (scan) of the threads in the warp or wavefront.
167167
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
168168
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
169-
uint64_t __lane_mask, uint32_t x) { \
170-
for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) { \
171-
uint32_t index = __gpu_lane_id() - step; \
172-
__bitmask_type bitmask = __gpu_lane_id() >= step; \
173-
x += __builtin_bit_cast( \
174-
__type, -bitmask & __builtin_bit_cast(__bitmask_type, \
175-
__gpu_shuffle_idx_##__suffix( \
176-
__lane_mask, index, x))); \
169+
uint64_t __lane_mask, uint32_t __x) { \
170+
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
171+
uint32_t __index = __gpu_lane_id() - __step; \
172+
__bitmask_type bitmask = __gpu_lane_id() >= __step; \
173+
__x += __builtin_bit_cast( \
174+
__type, \
175+
-bitmask & __builtin_bit_cast(__bitmask_type, \
176+
__gpu_shuffle_idx_##__suffix( \
177+
__lane_mask, __index, __x))); \
177178
} \
178-
return x; \
179+
return __x; \
179180
}
180181
__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
181182
__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)

clang/lib/Headers/nvptxintrin.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,11 +170,27 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
170170
__gpu_num_lanes() - 1u));
171171
}
172172

173+
// Returns true if the flat pointer points to CUDA 'shared' memory.
174+
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
175+
return __nvvm_isspacep_shared(ptr);
176+
}
177+
178+
// Returns true if the flat pointer points to CUDA 'local' memory.
179+
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
180+
return __nvvm_isspacep_local(ptr);
181+
}
182+
173183
// Terminates execution of the calling thread.
174184
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
175185
__nvvm_exit();
176186
}
177187

188+
// Suspend the thread briefly to assist the scheduler during busy loops.
189+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
190+
if (__nvvm_reflect("__CUDA_ARCH") >= 700)
191+
asm("nanosleep.u32 64;" ::: "memory");
192+
}
193+
178194
_Pragma("omp end declare variant");
179195
_Pragma("omp end declare target");
180196

clang/test/Headers/gpuintrin_lang.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
//
1414
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
1515
// RUN: -internal-isystem %S/../../lib/Headers/ \
16-
// RUN: -cl-std=CL3.0 -triple amdgcn -emit-llvm %s -o - \
16+
// RUN: -cl-std=CL2.0 -triple amdgcn -emit-llvm %s -o - \
1717
// RUN: | FileCheck %s --check-prefix=OPENCL
1818
//
1919
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \

0 commit comments

Comments
 (0)