|
| 1 | += sycl_ext_oneapi_joint_for |
| 2 | + |
| 3 | +:source-highlighter: coderay |
| 4 | +:coderay-linenums-mode: table |
| 5 | + |
| 6 | +// This section needs to be after the document title. |
| 7 | +:doctype: book |
| 8 | +:toc2: |
| 9 | +:toc: left |
| 10 | +:encoding: utf-8 |
| 11 | +:lang: en |
| 12 | +:dpcpp: pass:[DPC++] |
| 13 | + |
| 14 | +// Set the default source code type in this document to C++, |
| 15 | +// for syntax highlighting purposes. This is needed because |
| 16 | +// docbook uses c++ and html5 uses cpp. |
| 17 | +:language: {basebackend@docbook:c++:cpp} |
| 18 | + |
| 19 | + |
| 20 | +== Notice |
| 21 | + |
| 22 | +[%hardbreaks] |
| 23 | +Copyright (C) 2024 Intel Corporation. All rights reserved. |
| 24 | + |
| 25 | +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks |
| 26 | +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by |
| 27 | +permission by Khronos. |
| 28 | + |
| 29 | + |
| 30 | +== Contact |
| 31 | + |
| 32 | +To report problems with this extension, please open a new issue at: |
| 33 | + |
| 34 | +https://github.com/intel/llvm/issues |
| 35 | + |
| 36 | + |
| 37 | +== Dependencies |
| 38 | + |
| 39 | +This extension is written against the SYCL 2020 revision 9 specification. All |
| 40 | +references below to the "core SYCL specification" or to section numbers in the |
| 41 | +SYCL specification refer to that revision. |
| 42 | + |
| 43 | + |
| 44 | +== Status |
| 45 | + |
| 46 | +This is a proposed extension specification, intended to gather community |
| 47 | +feedback. Interfaces defined in this specification may not be implemented yet |
| 48 | +or may be in a preliminary state. The specification itself may also change in |
| 49 | +incompatible ways before it is finalized. *Shipping software products should |
| 50 | +not rely on APIs defined in this specification.* |
| 51 | + |
| 52 | + |
| 53 | +== Overview |
| 54 | + |
| 55 | +A very common idiom in SPMD programming sees developers distribute a range over |
| 56 | +the members of a group, by writing a loop that looks like this: |
| 57 | + |
| 58 | +[source,c++] |
| 59 | +---- |
| 60 | +for (int i = it.get_local_id(0); i < N; i += it.get_local_range(0)) { ... } |
| 61 | +---- |
| 62 | + |
| 63 | +The combination of setting the initial value based on the work-item index and |
| 64 | +setting the stride based on the number of work-items in the group ensures that |
| 65 | +any data accesses within the loop are appropriately interleaved to maximize |
| 66 | +performance. |
| 67 | + |
| 68 | +Although common, this idiom has several drawbacks: it is difficult to teach to |
| 69 | +developers unfamiliar with SPMD programming; it is easy to use incorrectly |
| 70 | +(e.g., by forgetting to set a different initial value on each work-item); and |
| 71 | +it is difficult to use generically (i.e., for different groups of work-items). |
| 72 | + |
| 73 | +This extension proposes to encapsulate this pattern in group algorithms, to |
| 74 | +simplify generic usage across different group types. |
| 75 | + |
| 76 | + |
| 77 | +== Specification |
| 78 | + |
| 79 | +=== Feature test macro |
| 80 | + |
| 81 | +This extension provides a feature-test macro as described in the core SYCL |
| 82 | +specification. An implementation supporting this extension must predefine the |
| 83 | +macro `SYCL_EXT_ONEAPI_JOINT_FOR` to one of the values defined in the table |
| 84 | +below. Applications can test for the existence of this macro to determine if |
| 85 | +the implementation supports this feature, or applications can test the macro's |
| 86 | +value to determine which of the extension's features the implementation |
| 87 | +supports. |
| 88 | + |
| 89 | + |
| 90 | +[%header,cols="1,5"] |
| 91 | +|=== |
| 92 | +|Value |
| 93 | +|Description |
| 94 | + |
| 95 | +|1 |
| 96 | +|The APIs of this experimental extension are not versioned, so the |
| 97 | + feature-test macro always has this value. |
| 98 | +|=== |
| 99 | + |
| 100 | + |
| 101 | +=== Group algorithms |
| 102 | + |
| 103 | +[source,c++] |
| 104 | +---- |
| 105 | +namespace sycl::ext::oneapi::experimental { |
| 106 | +
|
| 107 | +template <typename Group, typename InputIterator, typename Function> |
| 108 | +void joint_for_each(Group g, InputIterator first, InputIterator last, Function f); |
| 109 | +
|
| 110 | +} // namespace sycl::ext::oneapi::experimental |
| 111 | +---- |
| 112 | + |
| 113 | +_Constraints_: Available only if `sycl::is_group_v<std::decay_t<Group>>` is |
| 114 | +true and `InputIterator` is a random access iterator. |
| 115 | + |
| 116 | +_Preconditions_: `first` and `last` must be the same for all work-items in |
| 117 | +group `g`, and `f` must be an immutable callable with the same type and state |
| 118 | +for all work-items in group `g`. |
| 119 | + |
| 120 | +_Effects_: Blocks until all work-items in group `g` have |
| 121 | +reached this synchronization point, then applies `f` to the result of |
| 122 | +dereferencing every iterator in the range [`first`, `last`). |
| 123 | +The range is distributed across the work-items in group `g`. |
| 124 | + |
| 125 | +_Synchronization_: The call to this function in each work-item |
| 126 | +happens before the algorithm begins execution. |
| 127 | +The completion of the algorithm happens before any work-item |
| 128 | +blocking on the same synchronization point is unblocked. |
| 129 | + |
| 130 | +_Remarks_: If `f` returns a result, the result is ignored. |
| 131 | +If the range [`first`, `last`) is not evenly divisible by the number of |
| 132 | +work-items in group `g`, the number of times that `f` is invoked will differ |
| 133 | +across work-items. |
| 134 | + |
| 135 | +[source,c++] |
| 136 | +---- |
| 137 | +namespace sycl::ext::oneapi::experimental { |
| 138 | +
|
| 139 | +template <typename Group, typename Integer, typename Function> |
| 140 | +void joint_for(Group g, Integer first, Integer last, Function f); |
| 141 | +
|
| 142 | +} // namespace sycl::ext::oneapi::experimental |
| 143 | +---- |
| 144 | + |
| 145 | +_Constraints_: Available only if `sycl::is_group_v<std::decay_t<Group>>` is |
| 146 | +true and `std::is_integral_v<Integer>` is true. |
| 147 | + |
| 148 | +_Preconditions_: `first` and `last` must be the same for all work-items in |
| 149 | +group `g`, and `f` must be an immutable callable with the same type and state |
| 150 | +for all work-items in group `g`. |
| 151 | + |
| 152 | +_Effects_: Blocks until all work-items in group `g` have |
| 153 | +reached this synchronization point, then applies `f` to every |
| 154 | +integer in the range [`first`, `last`). |
| 155 | +The range is distributed across the work-items in group `g`. |
| 156 | + |
| 157 | +_Synchronization_: The call to this function in each work-item |
| 158 | +happens before the algorithm begins execution. |
| 159 | +The completion of the algorithm happens before any work-item |
| 160 | +blocking on the same synchronization point is unblocked. |
| 161 | + |
| 162 | +_Remarks_: If `f` returns a result, the result is ignored. |
| 163 | +If the range [`first`, `last`) is not evenly divisible by the number of |
| 164 | +work-items in group `g`, the number of times that `f` is invoked will differ |
| 165 | +across work-items. |
| 166 | + |
| 167 | + |
| 168 | +=== Usage example |
| 169 | + |
| 170 | +Both `joint_for_each` and `joint_for` can be used to distribute a loop over the |
| 171 | +work-items within a group. |
| 172 | + |
| 173 | +`joint_for_each` can be used to iterate over (and potentially modify) data when |
| 174 | +the range can be described using iterators: |
| 175 | + |
| 176 | +[source,c++] |
| 177 | +---- |
| 178 | +using syclex = sycl::ext::oneapi::experimental; |
| 179 | +
|
| 180 | +void scale(sycl::queue q, float* data, int N, float factor) { |
| 181 | + q.parallel_for(sycl::nd_range<1>(G, L), [=](sycl::nd_item<1> it) { |
| 182 | +
|
| 183 | + // identify the data this group will operate on |
| 184 | + auto g = it.get_group(); |
| 185 | + int per_group = N / g.get_group_linear_range(); |
| 186 | + float* offset = data + per_group * g.get_group_linear_id(); |
| 187 | +
|
| 188 | + // distribute the data over work-items in the group |
| 189 | + syclex::joint_for_each(g, offset, offset + per_group, [=](float& x) { |
| 190 | + x *= factor; |
| 191 | + }); |
| 192 | +
|
| 193 | + }); |
| 194 | +} |
| 195 | +---- |
| 196 | + |
| 197 | +`joint_for` can be used when iterators are not available, and is a shortcut for |
| 198 | +migrating integer-based `for` loops: |
| 199 | + |
| 200 | +[source,c++] |
| 201 | +---- |
| 202 | +using syclex = sycl::ext::oneapi::experimental; |
| 203 | +
|
| 204 | +void scale(sycl::queue q, float* data, int N, float factor) { |
| 205 | + q.parallel_for(sycl::nd_range<1>(G, L), [=](sycl::nd_item<1> it) { |
| 206 | +
|
| 207 | + // identify the data this group will operate on |
| 208 | + auto g = it.get_group(); |
| 209 | + int per_group = N / g.get_group_linear_range(); |
| 210 | + float* offset = data + per_group * g.get_group_linear_id(); |
| 211 | +
|
| 212 | + // distribute the data over work-items in the group |
| 213 | + syclex::joint_for(g, 0, per_group, [=](int i) { |
| 214 | + offset[i] *= factor; |
| 215 | + }); |
| 216 | +
|
| 217 | + }); |
| 218 | +} |
| 219 | +---- |
| 220 | + |
| 221 | + |
| 222 | +== Implementation notes |
| 223 | + |
| 224 | +This non-normative section provides information about one possible |
| 225 | +implementation of this extension. It is not part of the specification of the |
| 226 | +extension's API. |
| 227 | + |
| 228 | +A simple sketch of a possible implementation of this extension, which does not |
| 229 | +include the complexities of SFINAE and robust error checking, is given below: |
| 230 | + |
| 231 | +[source,c++] |
| 232 | +---- |
| 233 | +template <typename Group, typename InputIterator, typename Function> |
| 234 | +void joint_for_each(Group g, InputIterator first, InputIterator last, Function f) { |
| 235 | + sycl::group_barrier(g); |
| 236 | + typename std::iterator_traits<InputIterator>::difference_type offset = g.get_local_linear_id(); |
| 237 | + typename std::iterator_traits<InputIterator>::difference_type stride = g:get_local_linear_range(); |
| 238 | + for (InputIterator p = first + offset; p < last; p += stride) { |
| 239 | + f(*p); |
| 240 | + } |
| 241 | + sycl::group_barrier(g); |
| 242 | + return f; |
| 243 | +} |
| 244 | +
|
| 245 | +template <typename Group, typename Integer, typename Function> |
| 246 | +std::enable_if_t<std::is_integral_v<Integer>> |
| 247 | +joint_for(Group g, Integer first, Integer last, Function f) { |
| 248 | + sycl::group_barrier(g); |
| 249 | + Integer offset = g.get_local_linear_id(); |
| 250 | + Integer stride = g.get_local_linear_range(); |
| 251 | + for (Integer p = first + offset; p < last; p += stride) { |
| 252 | + f(p); |
| 253 | + } |
| 254 | + sycl::group_barrier(g); |
| 255 | + return f; |
| 256 | +} |
| 257 | +---- |
| 258 | + |
| 259 | +== Issues |
| 260 | + |
| 261 | +None. |
0 commit comments