|
| 1 | += sycl_ext_oneapi_work_group_local |
| 2 | + |
| 3 | +:source-highlighter: coderay |
| 4 | +:coderay-linenums-mode: table |
| 5 | + |
| 6 | +// This section needs to be after the document title. |
| 7 | +:doctype: book |
| 8 | +:toc2: |
| 9 | +:toc: left |
| 10 | +:encoding: utf-8 |
| 11 | +:lang: en |
| 12 | +:dpcpp: pass:[DPC++] |
| 13 | + |
| 14 | +// Set the default source code type in this document to C++, |
| 15 | +// for syntax highlighting purposes. This is needed because |
| 16 | +// docbook uses c++ and html5 uses cpp. |
| 17 | +:language: {basebackend@docbook:c++:cpp} |
| 18 | + |
| 19 | + |
| 20 | +== Notice |
| 21 | + |
| 22 | +[%hardbreaks] |
| 23 | +Copyright (C) 2023-2023 Intel Corporation. All rights reserved. |
| 24 | + |
| 25 | +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks |
| 26 | +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by |
| 27 | +permission by Khronos. |
| 28 | + |
| 29 | + |
| 30 | +== Contact |
| 31 | + |
| 32 | +To report problems with this extension, please open a new issue at: |
| 33 | + |
| 34 | +https://github.com/intel/llvm/issues |
| 35 | + |
| 36 | + |
| 37 | +== Dependencies |
| 38 | + |
| 39 | +This extension is written against the SYCL 2020 revision 7 specification. All |
| 40 | +references below to the "core SYCL specification" or to section numbers in the |
| 41 | +SYCL specification refer to that revision. |
| 42 | + |
| 43 | +The following extensions are required only for dynamic allocations: |
| 44 | + |
| 45 | +- link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] |
| 46 | + |
| 47 | +- link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties] |
| 48 | + |
| 49 | + |
| 50 | +== Status |
| 51 | + |
| 52 | +This is a proposed extension specification, intended to gather community |
| 53 | +feedback. Interfaces defined in this specification may not be implemented yet |
| 54 | +or may be in a preliminary state. The specification itself may also change in |
| 55 | +incompatible ways before it is finalized. *Shipping software products should |
| 56 | +not rely on APIs defined in this specification.* |
| 57 | + |
| 58 | + |
| 59 | +== Overview |
| 60 | + |
| 61 | +This extension defines a `sycl::ext::oneapi::experimental::work_group_local` |
| 62 | +class template with behavior inspired by the {cpp} `thread_local` keyword |
| 63 | +and the CUDA `+__shared__+` keyword. |
| 64 | + |
| 65 | +`work_group_local` variables can be allocated at global or function scope, |
| 66 | +lifting many of the restrictions in the existing |
| 67 | +link:../supported/sycl_ext_oneapi_local_memory.asciidoc[sycl_ext_oneapi_local_memory] |
| 68 | +extension. Note, however, that `work_group_local` variables currently place |
| 69 | +additional limits on the types that can be allocated, owing to differences in |
| 70 | +constructor behavior. |
| 71 | + |
| 72 | + |
| 73 | +== Specification |
| 74 | + |
| 75 | +=== Feature test macro |
| 76 | + |
| 77 | +This extension provides a feature-test macro as described in the core SYCL |
| 78 | +specification. An implementation supporting this extension must predefine the |
| 79 | +macro `SYCL_EXT_ONEAPI_WORK_GROUP_LOCAL` to one of the values defined in the |
| 80 | +table below. Applications can test for the existence of this macro to |
| 81 | +determine if the implementation supports this feature, or applications can test |
| 82 | +the macro's value to determine which of the extension's features the |
| 83 | +implementation supports. |
| 84 | + |
| 85 | +[%header,cols="1,5"] |
| 86 | +|=== |
| 87 | +|Value |
| 88 | +|Description |
| 89 | + |
| 90 | +|1 |
| 91 | +|The APIs of this experimental extension are not versioned, so the |
| 92 | + feature-test macro always has this value. |
| 93 | +|=== |
| 94 | + |
| 95 | + |
| 96 | +=== `work_group_local` class template |
| 97 | + |
| 98 | +The `work_group_local` class template acts as a view of an |
| 99 | +implementation-managed pointer to work-group local memory. |
| 100 | + |
| 101 | +[source,c++] |
| 102 | +---- |
| 103 | +namespace sycl::ext::oneapi::experimental { |
| 104 | +
|
| 105 | +template <typename T> |
| 106 | +class work_group_local { |
| 107 | +public: |
| 108 | +
|
| 109 | + work_group_local() = default; |
| 110 | + work_group_local(const work_group_local&) = delete; |
| 111 | + work_group_local& operator=(const work_group_local&) = delete; |
| 112 | +
|
| 113 | + operator T&() const noexcept; |
| 114 | +
|
| 115 | + // Available only if: std::is_array_v<T> == false |
| 116 | + const work_group_local& operator=(const T& value) const noexcept; |
| 117 | +
|
| 118 | + T* operator&() const noexcept; |
| 119 | +
|
| 120 | +private: |
| 121 | + T* ptr; // exposition only |
| 122 | +
|
| 123 | +}; |
| 124 | +
|
| 125 | +} // namespace sycl::ext::oneapi::experimental |
| 126 | +---- |
| 127 | + |
| 128 | +`T` must be trivially constructible and trivially destructible. |
| 129 | + |
| 130 | +The storage for the object is allocated in work-group local memory before |
| 131 | +calling the user's kernel lambda, and deallocated when all work-items |
| 132 | +in the group have completed execution of the kernel. |
| 133 | + |
| 134 | +SYCL implementations conforming to the full feature set treat |
| 135 | +`work_group_local` similarly to the `thread_local` keyword, and when |
| 136 | +a `work_group_local` object is declared at block scope it behaves |
| 137 | +as if the `static` keyword was specified implicitly. SYCL implementations |
| 138 | +conforming to the reduced feature set require the `static` keyword to be |
| 139 | +specified explicitly. |
| 140 | + |
| 141 | +[NOTE] |
| 142 | +==== |
| 143 | +If a `work_group_local` object is declared at function scope, the work-group |
| 144 | +local memory associated with the object will be identical for all usages of |
| 145 | +that function within the kernel. In cases where a function is called multiple |
| 146 | +times, developers must take care to avoid race conditions (e.g., by calling |
| 147 | +`group_barrier` before and after using the memory). |
| 148 | +==== |
| 149 | + |
| 150 | +SYCL 2020 requires that all global variables accessed by a device function are |
| 151 | +`const` or `constexpr`. This extension lifts that restriction for |
| 152 | +`work_group_local` variables. |
| 153 | + |
| 154 | +[NOTE] |
| 155 | +==== |
| 156 | +Since `work_group_local` acts as a view, wrapping an underlying pointer, a |
| 157 | +developer may still choose to declare variables as `const`. |
| 158 | +==== |
| 159 | + |
| 160 | +When `T` is a class type or bounded array, the size of the allocation is known |
| 161 | +at compile-time, and a SYCL implementation may embed the size of the allocation |
| 162 | +directly within a kernel. Each instance of `work_group_local<T>` is associated |
| 163 | +with a unique allocation in work-group local memory. |
| 164 | + |
| 165 | +When `T` is an unbounded array, the size of the allocation is unknown at |
| 166 | +compile-time, and must be communicated to the SYCL implementation via the |
| 167 | +`work_group_local_memory_size` property. Every instance of `work_group_local` |
| 168 | +for which `T` is an unbounded array is associated with a single, shared, |
| 169 | +allocation in work-group local memory. For example, two instances declared as |
| 170 | +`work_group_local<int[]>` and `work_group_local<float[]>` will be associated |
| 171 | +with the same shared allocation. |
| 172 | + |
| 173 | +If the total amount of local memory requested (i.e., the sum of all memory |
| 174 | +requested by `local_accessor`, `group_local_memory`, |
| 175 | +`group_local_memory_for_overwrite` and `work_group_local`) exceeds a device's |
| 176 | +local memory capacity (as reported by `local_mem_size`) then the implementation |
| 177 | +must throw a synchronous `exception` with the `errc::memory_allocation` error |
| 178 | +code from the kernel invocation command (e.g. `parallel_for`). |
| 179 | + |
| 180 | +[source,c++] |
| 181 | +---- |
| 182 | +operator T&() const noexcept; |
| 183 | +---- |
| 184 | +_Returns_: A reference to the object stored in the work-group local memory |
| 185 | +associated with this instance of `work_group_local`. |
| 186 | + |
| 187 | +[source,c++] |
| 188 | +---- |
| 189 | +const work_group_local<T>& operator=(const T& value) const noexcept; |
| 190 | +---- |
| 191 | +_Constraints_: Available only if `std::is_array_v<T>>` is false. |
| 192 | + |
| 193 | +_Effects_: Replaces the value referenced by `*ptr` with `value`. |
| 194 | + |
| 195 | +_Returns_: A reference to this instance of `work_group_local`. |
| 196 | + |
| 197 | +[source,c++] |
| 198 | +---- |
| 199 | +T* operator&() const noexcept; |
| 200 | +---- |
| 201 | +_Returns_: A pointer to the work-group local memory associated with this |
| 202 | +instance of `work_group_local` (i.e., `ptr`). |
| 203 | + |
| 204 | + |
| 205 | +==== Kernel properties |
| 206 | + |
| 207 | +The `work_group_local_size` property must be passed to a kernel to determine |
| 208 | +the run-time size of the work-group local memory allocation associated with |
| 209 | +all `work_group_local` variables of unbounded array type. |
| 210 | + |
| 211 | +[source,c++] |
| 212 | +---- |
| 213 | +namespace sycl::ext::oneapi::experimental { |
| 214 | +
|
| 215 | +struct work_group_local_size { |
| 216 | + constexpr work_group_local_size(size_t bytes) : value(bytes) {} |
| 217 | + size_t value; |
| 218 | +}; // work_group_local_size |
| 219 | +
|
| 220 | +using work_group_local_size_key = work_group_local_size; |
| 221 | +
|
| 222 | +template <>struct is_property_key<work_group_local_size_key> : std::true_type {}; |
| 223 | +
|
| 224 | +} // namespace sycl::ext::oneapi::experimental |
| 225 | +---- |
| 226 | + |
| 227 | +|=== |
| 228 | +|Property|Description |
| 229 | + |
| 230 | +|`work_group_local_size` |
| 231 | +|The `work_group_local_size` property describes the amount of dynamic |
| 232 | +work-group local memory required by the kernel in bytes. |
| 233 | + |
| 234 | +|=== |
| 235 | + |
| 236 | + |
| 237 | +==== Usage examples |
| 238 | + |
| 239 | +===== Allocations with size known at compile-time |
| 240 | + |
| 241 | +[source,c++] |
| 242 | +---- |
| 243 | +using namespace syclex = sycl::ext::oneapi::experimental; |
| 244 | +
|
| 245 | +/* optional: static const */ syclex::work_group_local<int> program_scope_scalar; |
| 246 | +/* optional: static const */ syclex::work_group_local<int[16]> program_scope_array; |
| 247 | +
|
| 248 | +void foo() { |
| 249 | + /* optional: static const */ syclex::work_group_local<int> function_scope_scalar; |
| 250 | + function_scope_scalar = 1; // assignment via overloaded = operator |
| 251 | + function_scope_scalar += 2; // += operator via implicit conversion to int& |
| 252 | + int* ptr = &function_scope_scalar; // conversion to pointer via overloaded & operator |
| 253 | +} |
| 254 | +
|
| 255 | +void bar() { |
| 256 | + /* optional: static const */ sylex::work_group_local<int[64]> function_scope_array; |
| 257 | + function_scope_array[0] = 1; // [] operator via implicit conversion to int(&)[64] |
| 258 | + int* ptr = function_scope_array; // conversion to pointer via implicit conversion to int(&)[64] |
| 259 | +} |
| 260 | +---- |
| 261 | + |
| 262 | +===== Allocations with size unknown at compile-time |
| 263 | + |
| 264 | +[source,c++] |
| 265 | +---- |
| 266 | +using namespace syclex = sycl::ext::oneapi::experimental; |
| 267 | +
|
| 268 | +/* optional: static const */ syclex::work_group_local<int[]> dynamic_program_scope_array; |
| 269 | +
|
| 270 | +... |
| 271 | +
|
| 272 | +q.parallel_for(sycl::nd_range<1>{N, M}, |
| 273 | + syclex::properties{syclex::work_group_local_size(M * sizeof(int))}, |
| 274 | + [=](sycl::nd_item<1> it) { |
| 275 | + ... |
| 276 | +}); |
| 277 | +---- |
| 278 | + |
| 279 | + |
| 280 | +== Implementation notes |
| 281 | + |
| 282 | +This non-normative section provides information about one possible |
| 283 | +implementation of this extension. It is not part of the specification of the |
| 284 | +extension's API. |
| 285 | + |
| 286 | +For class types and bounded arrays, the class can be implemented on top of |
| 287 | +the existing `__sycl_allocateLocalMemory` intrinsic: |
| 288 | +[source,c++] |
| 289 | +---- |
| 290 | +#ifdef __SYCL_DEVICE_ONLY__ |
| 291 | + __attribute__((opencl_local)) T *ptr = reinterpret_cast<__attribute__((opencl_local)) T *>(__sycl_allocateLocalMemory(sizeof(T), alignof(T))); |
| 292 | +#else |
| 293 | + T *ptr{}; |
| 294 | +#endif |
| 295 | +---- |
| 296 | + |
| 297 | +Note, however, that implementing the correct semantics may require some |
| 298 | +adjustment to the handling of this intrinsic. A simple class as written above |
| 299 | +would create a separate allocation for every call to an inlined function. |
| 300 | +Creating work-group local allocations should be handled before inlining to |
| 301 | +prevent this. |
| 302 | + |
| 303 | +For unbounded arrays, a separate specialization of the class will be required, |
| 304 | +and the implementation may need to generate some additional code to |
| 305 | +appropriately initialize the pointer(s) wrapped by `work_group_local` objects. |
| 306 | +Alternatively, it may be possible to initialize the pointer to the beginning |
| 307 | +of the device's local memory region (if that value is known). Either way, the |
| 308 | +implementation must account for the existence of one or more `local_accessor` |
| 309 | +objects (which themselves may allocate a dynamic amount of work-group local |
| 310 | +memory). |
| 311 | + |
| 312 | + |
| 313 | +== Issues |
| 314 | + |
| 315 | +None. |
0 commit comments