Skip to content

Commit 4a3348e

Browse files
committed
[Clang] Implement resource directory headers for common GPU intrinsics
Summary: All GPU based languages provide some way to access things like the thread ID or other resources. However, this is spread between many different languages and it varies between targets. The goal here is to provide a resource directory header that just provides these in an easier to understand way, primarily so this can be used for C/C++ code. The interface aims to be common, to faciliate easier porting, but target specific stuff could be put in the individual headers.
1 parent da4b972 commit 4a3348e

File tree

4 files changed

+403
-0
lines changed

4 files changed

+403
-0
lines changed

clang/lib/Headers/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,12 @@ set(x86_files
268268
cpuid.h
269269
)
270270

271+
set(gpu_files
272+
gpuintrin.h
273+
nvptxintrin.h
274+
amdgpuintrin.h
275+
)
276+
271277
set(windows_only_files
272278
intrin0.h
273279
intrin.h
@@ -296,6 +302,7 @@ set(files
296302
${systemz_files}
297303
${ve_files}
298304
${x86_files}
305+
${gpu_files}
299306
${webassembly_files}
300307
${windows_only_files}
301308
${utility_files}
@@ -518,6 +525,7 @@ add_header_target("systemz-resource-headers" "${systemz_files};${zos_wrapper_fil
518525
add_header_target("ve-resource-headers" "${ve_files}")
519526
add_header_target("webassembly-resource-headers" "${webassembly_files}")
520527
add_header_target("x86-resource-headers" "${x86_files}")
528+
add_header_target("gpu-resource-headers" "${gpu_files}")
521529

522530
# Other header groupings
523531
add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -704,6 +712,12 @@ install(
704712
EXCLUDE_FROM_ALL
705713
COMPONENT x86-resource-headers)
706714

715+
install(
716+
FILES ${gpu_files}
717+
DESTINATION ${header_install_dir}
718+
EXCLUDE_FROM_ALL
719+
COMPONENT gpu-resource-headers)
720+
707721
if(NOT CLANG_ENABLE_HLSL)
708722
set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
709723
endif()

clang/lib/Headers/amdgpuintrin.h

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef __AMDGPUINTRIN_H
10+
#define __AMDGPUINTRIN_H
11+
12+
#ifndef __AMDGPU__
13+
#error "This file is intended for AMDGPU targets or offloading to AMDGPU
14+
#endif
15+
16+
#include <stdbool.h>
17+
#include <stdint.h>
18+
19+
#if defined(__HIP__) || defined(__CUDA__)
20+
#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
21+
#else
22+
#define _DEFAULT_ATTRS __attribute__((always_inline))
23+
#endif
24+
25+
#pragma omp begin declare target device_type(nohost)
26+
#pragma omp begin declare variant match(device = {arch(amdgcn)})
27+
28+
// Type aliases to the address spaces used by the AMDGPU backend.
29+
#define _private __attribute__((opencl_private))
30+
#define _constant __attribute__((opencl_constant))
31+
#define _local __attribute__((opencl_local))
32+
#define _global __attribute__((opencl_global))
33+
34+
// Attribute to declare a function as a kernel.
35+
#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
36+
37+
// Returns the number of workgroups in the 'x' dimension of the grid.
38+
_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
39+
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
40+
}
41+
42+
// Returns the number of workgroups in the 'y' dimension of the grid.
43+
_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
44+
return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
45+
}
46+
47+
// Returns the number of workgroups in the 'z' dimension of the grid.
48+
_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
49+
return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
50+
}
51+
52+
// Returns the total number of workgruops in the grid.
53+
_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
54+
return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
55+
}
56+
57+
// Returns the 'x' dimension of the current AMD workgroup's id.
58+
_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
59+
return __builtin_amdgcn_workgroup_id_x();
60+
}
61+
62+
// Returns the 'y' dimension of the current AMD workgroup's id.
63+
_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
64+
return __builtin_amdgcn_workgroup_id_y();
65+
}
66+
67+
// Returns the 'z' dimension of the current AMD workgroup's id.
68+
_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
69+
return __builtin_amdgcn_workgroup_id_z();
70+
}
71+
72+
// Returns the absolute id of the AMD workgroup.
73+
_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
74+
return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
75+
_get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
76+
}
77+
78+
// Returns the number of workitems in the 'x' dimension.
79+
_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
80+
return __builtin_amdgcn_workgroup_size_x();
81+
}
82+
83+
// Returns the number of workitems in the 'y' dimension.
84+
_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
85+
return __builtin_amdgcn_workgroup_size_y();
86+
}
87+
88+
// Returns the number of workitems in the 'z' dimension.
89+
_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
90+
return __builtin_amdgcn_workgroup_size_z();
91+
}
92+
93+
// Returns the total number of workitems in the workgroup.
94+
_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
95+
return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
96+
}
97+
98+
// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
99+
_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
100+
return __builtin_amdgcn_workitem_id_x();
101+
}
102+
103+
// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
104+
_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
105+
return __builtin_amdgcn_workitem_id_y();
106+
}
107+
108+
// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
109+
_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
110+
return __builtin_amdgcn_workitem_id_z();
111+
}
112+
113+
// Returns the absolute id of the thread in the current AMD workgroup.
114+
_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
115+
return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
116+
_get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
117+
}
118+
119+
// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
120+
// and compilation options.
121+
_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
122+
return __builtin_amdgcn_wavefrontsize();
123+
}
124+
125+
// Returns the id of the thread inside of an AMD wavefront executing together.
126+
_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
127+
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
128+
}
129+
130+
// Returns the bit-mask of active threads in the current wavefront.
131+
_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
132+
return __builtin_amdgcn_read_exec();
133+
}
134+
135+
// Copies the value from the first active thread in the wavefront to the rest.
136+
_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
137+
_broadcast_value(uint64_t, uint32_t x) {
138+
return __builtin_amdgcn_readfirstlane(x);
139+
}
140+
141+
// Returns a bitmask of threads in the current lane for which \p x is true.
142+
_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
143+
_ballot(uint64_t lane_mask, bool x) {
144+
// The lane_mask & gives the nvptx semantics when lane_mask is a subset of
145+
// the active threads
146+
return lane_mask & __builtin_amdgcn_ballot_w64(x);
147+
}
148+
149+
// Waits for all the threads in the block to converge and issues a fence.
150+
_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() {
151+
__builtin_amdgcn_s_barrier();
152+
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
153+
}
154+
155+
// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
156+
_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_lane(uint64_t) {
157+
__builtin_amdgcn_wave_barrier();
158+
}
159+
160+
// Shuffles the the lanes inside the wavefront according to the given index.
161+
_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
162+
_shuffle(uint64_t, uint32_t idx, uint32_t x) {
163+
return __builtin_amdgcn_ds_bpermute(idx << 2, x);
164+
}
165+
166+
// Returns the current value of the GPU's processor clock.
167+
// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
168+
_DEFAULT_ATTRS static inline uint64_t _processor_clock() {
169+
return __builtin_readcyclecounter();
170+
}
171+
172+
// Returns a fixed-frequency timestamp. The actual frequency is dependent on
173+
// the card and can only be queried via the driver.
174+
_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() {
175+
return __builtin_readsteadycounter();
176+
}
177+
178+
// Terminates execution of the associated wavefront.
179+
_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() {
180+
__builtin_amdgcn_endpgm();
181+
}
182+
183+
#pragma omp end declare variant
184+
#pragma omp end declare target
185+
#undef _DEFAULT_ATTRS
186+
187+
#endif // __AMDGPUINTRIN_H

clang/lib/Headers/gpuintrin.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef __GPUINTRIN_H
10+
#define __GPUINTRIN_H
11+
12+
#if defined(__NVPTX__)
13+
#include <nvptxintrin.h>
14+
#elif defined(__AMDGPU__)
15+
#include <amdgpuintrin.h>
16+
#endif
17+
18+
#endif // __GPUINTRIN_H

0 commit comments

Comments
 (0)