@@ -10,178 +10,144 @@
 #define __AMDGPUINTRIN_H
 
 #ifndef __AMDGPU__
-#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
 #endif
 
 #include <stdbool.h>
 #include <stdint.h>
 
 #if defined(__HIP__) || defined(__CUDA__)
-#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
-#else
-#define _DEFAULT_ATTRS __attribute__((always_inline))
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
 #endif
 
 #pragma omp begin declare target device_type(nohost)
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 // Type aliases to the address spaces used by the AMDGPU backend.
-#define _private __attribute__((opencl_private))
-#define _constant __attribute__((opencl_constant))
-#define _local __attribute__((opencl_local))
-#define _global __attribute__((opencl_global))
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))
 
 // Attribute to declare a function as a kernel.
-#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+#define _Kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of workgroups in the 'x' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workgroups in the 'y' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
   return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workgroups in the 'z' dimension of the grid.
-_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
   return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
 }
 
-// Returns the total number of workgruops in the grid.
-_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
-  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
-}
-
 // Returns the 'x' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
   return __builtin_amdgcn_workgroup_id_x();
 }
 
 // Returns the 'y' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
   return __builtin_amdgcn_workgroup_id_y();
 }
 
 // Returns the 'z' dimension of the current AMD workgroup's id.
-_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
  return __builtin_amdgcn_workgroup_id_z();
 }
 
-// Returns the absolute id of the AMD workgroup.
-_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
-  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
-         _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
-}
-
 // Returns the number of workitems in the 'x' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
   return __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workitems in the 'y' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
   return __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workitems in the 'z' dimension.
-_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
   return __builtin_amdgcn_workgroup_size_z();
 }
 
-// Returns the total number of workitems in the workgroup.
-_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
-  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
-}
-
 // Returns the 'x' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
   return __builtin_amdgcn_workitem_id_x();
 }
 
 // Returns the 'y' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
   return __builtin_amdgcn_workitem_id_y();
 }
 
 // Returns the 'z' dimension id of the workitem in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
   return __builtin_amdgcn_workitem_id_z();
 }
 
-// Returns the absolute id of the thread in the current AMD workgroup.
-_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
-  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
-         _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
-}
-
 // Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
 // and compilation options.
-_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
   return __builtin_amdgcn_wavefrontsize();
 }
 
 // Returns the id of the thread inside of an AMD wavefront executing together.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
 
 // Returns the bit-mask of active threads in the current wavefront.
-_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
   return __builtin_amdgcn_read_exec();
 }
 
 // Copies the value from the first active thread in the wavefront to the rest.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-_broadcast_value(uint64_t, uint32_t x) {
-  return __builtin_amdgcn_readfirstlane(x);
+__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
 }
 
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
-_ballot(uint64_t lane_mask, bool x) {
+__gpu_ballot(uint64_t __lane_mask, bool __x) {
   // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
   // the active threads
-  return lane_mask & __builtin_amdgcn_ballot_w64(x);
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() {
+_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
   __builtin_amdgcn_s_barrier();
   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 }
 
 // Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
-_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_lane(uint64_t) {
+_DEFAULT_ATTRS [[clang::convergent]] static inline void
+__gpu_sync_lane(uint64_t __lane_mask) {
   __builtin_amdgcn_wave_barrier();
 }
 
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-_shuffle(uint64_t, uint32_t idx, uint32_t x) {
-  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
-}
-
-// Returns the current value of the GPU's processor clock.
-// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
-_DEFAULT_ATTRS static inline uint64_t _processor_clock() {
-  return __builtin_readcyclecounter();
-}
-
-// Returns a fixed-frequency timestamp. The actual frequency is dependent on
-// the card and can only be queried via the driver.
-_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
+__gpu_shuffle_idx(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
 }
 
 // Terminates execution of the associated wavefront.
-_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() {
+_DEFAULT_ATTRS [[noreturn]] static inline void __gpu_exit() {
   __builtin_amdgcn_endpgm();
 }
 
 #pragma omp end declare variant
 #pragma omp end declare target
-#undef _DEFAULT_ATTRS
 
 #endif // __AMDGPUINTRIN_H
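
Usage sketch (not part of this commit): the renamed per-dimension __gpu_* helpers compose the same way the deleted _get_thread_id() and _get_block_id() aggregates did. The kernel below is hypothetical; its name, the <amdgpuintrin.h> include path, and the assumption that the translation unit is compiled for (or offloaded to) an AMDGPU target so __AMDGPU__ is defined are all illustrative choices, not anything specified by the change itself.

// Hypothetical example: write each workitem's flat grid index and take a
// wavefront ballot, using only the renamed __gpu_* interface from the diff.
#include <amdgpuintrin.h>
#include <stdint.h>

_Kernel void write_flat_ids(_Global uint32_t *out, uint32_t n) {
  // Flat id within the workgroup, rebuilt as the removed _get_thread_id() did.
  uint32_t local_id =
      __gpu_thread_id_x() + __gpu_num_threads_x() * __gpu_thread_id_y() +
      __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_thread_id_z();

  // Flat workgroup id, rebuilt as the removed _get_block_id() did.
  uint32_t block_id =
      __gpu_block_id_x() + __gpu_num_blocks_x() * __gpu_block_id_y() +
      __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_block_id_z();

  uint32_t block_size =
      __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_num_threads_z();
  uint64_t id = (uint64_t)block_id * block_size + local_id;

  if (id < n)
    out[id] = (uint32_t)id;

  // Wavefront-level helpers: a mask of the lanes that were in bounds. A real
  // kernel might popcount this or __gpu_broadcast() a value to every lane.
  uint64_t in_bounds = __gpu_ballot(__gpu_lane_mask(), id < n);
  (void)in_bounds;
}

The rename also moves every identifier (functions, macros such as _Kernel, and parameters like __lane_mask) into the double-underscore or capitalized reserved namespace, which is the usual convention for compiler-provided headers so they cannot collide with user macros.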