|
| 1 | += sycl_ext_oneapi_user_defined_reductions |
| 2 | + |
| 3 | +:source-highlighter: coderay |
| 4 | +:coderay-linenums-mode: table |
| 5 | + |
| 6 | +// This section needs to be after the document title. |
| 7 | +:doctype: book |
| 8 | +:toc2: |
| 9 | +:toc: left |
| 10 | +:encoding: utf-8 |
| 11 | +:lang: en |
| 12 | +:dpcpp: pass:[DPC++] |
| 13 | + |
| 14 | +// Set the default source code type in this document to C++, |
| 15 | +// for syntax highlighting purposes. This is needed because |
| 16 | +// docbook uses c++ and html5 uses cpp. |
| 17 | +:language: {basebackend@docbook:c++:cpp} |
| 18 | + |
| 19 | +== Notice |
| 20 | + |
| 21 | +[%hardbreaks] |
| 22 | +Copyright (C) 2022 Intel Corporation. All rights reserved. |
| 23 | + |
| 24 | +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks |
| 25 | +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by |
| 26 | +permission by Khronos. |
| 27 | + |
| 28 | +== Contact |
| 29 | + |
| 30 | +To report problems with this extension, please open a new issue at: |
| 31 | + |
| 32 | +https://github.com/intel/llvm/issues |
| 33 | + |
| 34 | +== Dependencies |
| 35 | + |
| 36 | +This extension is written against the SYCL 2020 revision 5 specification. All |
| 37 | +references below to the "core SYCL specification" or to section numbers in the |
| 38 | +SYCL specification refer to that revision. |
| 39 | + |
| 40 | +This extension also depends on the following other SYCL extensions: |
| 41 | + |
| 42 | +* link:../experimental/sycl_ext_oneapi_group_sort.asciidoc[ |
| 43 | + sycl_ext_oneapi_group_sort] |
| 44 | + |
| 45 | +== Status |
| 46 | + |
| 47 | +This is a proposed extension specification, intended to gather community |
| 48 | +feedback. Interfaces defined in this specification may not be implemented yet |
| 49 | +or may be in a preliminary state. The specification itself may also change in |
| 50 | +incompatible ways before it is finalized. *Shipping software products should |
| 51 | +not rely on APIs defined in this specification.* |
| 52 | + |
| 53 | +== Overview |
| 54 | + |
| 55 | +The purpose of this extension is to expand functionality of `reduce_over_group` |
| 56 | +and `joint_reduce` free functions defined in section 4.17.4.5. `reduce` of the |
| 57 | +core SYCL specification by allowing user-defined binary operators and |
| 58 | +non-fundamental types. |
| 59 | + |
| 60 | +== Specification |
| 61 | + |
| 62 | +=== Feature test macro |
| 63 | + |
| 64 | +This extension provides a feature-test macro as described in the core SYCL |
| 65 | +specification section 6.3.3 Feature test macros. Therefore, an implementation |
| 66 | +supporting this extension must predefine the macro |
| 67 | +`SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS` to one of the values defined in the |
| 68 | +table below. |
| 69 | +Application can test for existence of this macro to determine if the |
| 70 | +implementation supports this feature, or applications can test the macro's value |
| 71 | +to determine which of the extensions's APIs the implementation supports. |
| 72 | + |
| 73 | +Table 1. Values of the `SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS` macro. |
| 74 | +[%header,cols="1,5"] |
| 75 | +|=== |
| 76 | +|Value |Description |
| 77 | +|1 |Initial extension version. Base features are supported. |
| 78 | +|=== |
| 79 | + |
| 80 | +=== Reduction functions |
| 81 | + |
| 82 | +This extension provides two overloads of `reduce_over_group` defined by the core |
| 83 | +SYCL specification. |
| 84 | + |
| 85 | +[source,c++] |
| 86 | +---- |
| 87 | +namespace sycl::ext::oneapi::experimental { |
| 88 | +
|
| 89 | + template <typename GroupHelper, typename Ptr, typename BinaryOperation> |
| 90 | + std::iterator_traits<Ptr>::value_type joint_reduce(GroupHelper g, Ptr first, Ptr last, BinaryOperation binary_op); // (1) |
| 91 | +
|
| 92 | + template <typename GroupHelper, typename Ptr, typename T, typename BinaryOperation> |
| 93 | + T joint_reduce(GroupHelper g, Ptr first, Ptr last, T init, BinaryOperation binary_op); // (2) |
| 94 | +
|
| 95 | + template <typename GroupHelper, typename T, typename BinaryOperation> |
| 96 | + T reduce_over_group(GroupHelper g, T x, BinaryOperation binary_op); // (3) |
| 97 | +
|
| 98 | + template <typename GroupHelper, typename V, typename T, typename BinaryOperation> |
| 99 | + T reduce_over_group(GroupHelper g, V x, T init, BinaryOperation binary_op); // (4) |
| 100 | +} |
| 101 | +---- |
| 102 | + |
| 103 | +1._Constraints_: Available only when `is_group_helper<GroupHelper>` evaluates to `true`. |
| 104 | +The behavior of this trait is defined in link:../experimental/sycl_ext_oneapi_group_sort.asciidoc[sycl_ext_oneapi_group_sort]. |
| 105 | + |
| 106 | +_Mandates_: `binary_op(*first, *first)` must return a value of type |
| 107 | +`std::iterator_traits<Ptr>::value_type`. |
| 108 | + |
| 109 | +_Preconditions_: `first`, `last` and the type of `binary_op` must be the same |
| 110 | +for all work-items in the group. `binary_op` must be an instance of a function |
| 111 | +object. |
| 112 | +The size of memory contained by `GroupHelper` object `g` must |
| 113 | +be at least `sizeof(T) * g.get_group().get_local_range().size()` bytes. |
| 114 | +`binary_op` must be an instance of a function object. |
| 115 | + |
| 116 | +_Returns_: The result of combining the values resulting from dereferencing all |
| 117 | +iterators in the range `[first, last)` using the operator `binary_op`, where the |
| 118 | +values are combined according to the generalized sum defined in standard C++. |
| 119 | + |
| 120 | +NOTE: If `T` is a fundamental type and `BinaryOperation` is a SYCL function |
| 121 | +object type, then memory attached to `GroupHelper` object `g` is not used and |
| 122 | +the call to this overload is equivalent to calling |
| 123 | +`sycl::joint_reduce(g.get_group(), first, last, binary_op)`. |
| 124 | + |
| 125 | +2._Constraints_: Available only when `is_group_helper<GroupHelper>` evaluates to `true`. |
| 126 | +The behavior of this trait is defined in link:../experimental/sycl_ext_oneapi_group_sort.asciidoc[sycl_ext_oneapi_group_sort]. |
| 127 | + |
| 128 | +_Mandates_: `binary_op(init, *first)` must return a value of type `T`. `T` must |
| 129 | +satisfy MoveConstructible requirement. |
| 130 | + |
| 131 | +_Preconditions_: `first`, `last`, `init` and the type of `binary_op` must be the |
| 132 | +same for all work-items in the group. `binary_op` must be an instance of a |
| 133 | +function object. |
| 134 | +The size of memory contained by `GroupHelper` object `g` must |
| 135 | +be at least `sizeof(T) * g.get_group().get_local_range().size()` bytes. |
| 136 | +`binary_op` must be an instance of a function object. |
| 137 | + |
| 138 | +_Returns_: The result of combining the values resulting from dereferencing all |
| 139 | +iterators in the range `[first, last)` and the initial value `init` using the |
| 140 | +operator `binary_op`, where the values are combined according to the generalized |
| 141 | +sum defined in standard C++. |
| 142 | + |
| 143 | +3._Constraints_: Available only when `is_group_helper<GroupHelper>` evaluates to `true`. |
| 144 | +The behavior of this trait is defined in link:../experimental/sycl_ext_oneapi_group_sort.asciidoc[sycl_ext_oneapi_group_sort]. |
| 145 | + |
| 146 | +_Mandates_: `binary_op(x, x)` must return a value of type `T`. |
| 147 | + |
| 148 | +_Preconditions_: The size of memory contained by `GroupHelper` object `g` must |
| 149 | +be at least `sizeof(T) * g.get_group().get_local_range().size()` bytes. |
| 150 | +`binary_op` must be an instance of a function object. |
| 151 | + |
| 152 | +_Returns_: The result of combining all the values of `x` specified by each |
| 153 | +work-item in the group using the operator `binary_op`, where the values are |
| 154 | +combined according to the generalized sum defined in standard C++. |
| 155 | + |
| 156 | +NOTE: If `T` is a fundamental type and `BinaryOperation` is a SYCL function |
| 157 | +object type, then memory attached to `GroupHelper` object `g` is not used and |
| 158 | +the call to this overload is equivalent to calling |
| 159 | +`sycl::reduce_over_group(g.get_group(), x, binary_op)`. |
| 160 | + |
| 161 | +4._Constraints_: Available only when `is_group_helper<GroupHelper>` evaluates to `true`. |
| 162 | +The behavior of this trait is defined in link:../experimental/sycl_ext_oneapi_group_sort.asciidoc[sycl_ext_oneapi_group_sort]. |
| 163 | + |
| 164 | +_Mandates_: `binary_op(init, x)` and `binary_op(x, x)` must return a value of |
| 165 | +type `T`. |
| 166 | + |
| 167 | +_Preconditions_: The size of memory contained by `GroupHelper` object `g` must |
| 168 | +be at least `sizeof(T) * g.get_group().get_local_range().size()` bytes. |
| 169 | +`binary_op` must be an instance of a function object. |
| 170 | + |
| 171 | +_Returns_: The result of combining all the values of `x` specified by each |
| 172 | +work-item in the group and the initial value `init` using the operator |
| 173 | +`binary_op`, where the values are combined according to the generalized sum |
| 174 | +defined in standard C++. |
| 175 | + |
| 176 | +NOTE: If `T` and `V` are fundamental types and `BinaryOperation` is a SYCL |
| 177 | +function object type, then memory attached to `GroupHelper` object `g` is not |
| 178 | +used and the call to this overload is equivalent to calling |
| 179 | +`sycl::reduce_over_group(g.get_group(), x, init, binary_op)`. |
| 180 | + |
| 181 | +NOTE: Implementation of all overaloads may use less memory than passed |
| 182 | +to the function depending on the exact algorithm which is used for doing the |
| 183 | +reduction. |
| 184 | + |
| 185 | +== Example usage |
| 186 | + |
| 187 | +[source,c++] |
| 188 | +---- |
| 189 | +template <typename T> |
| 190 | +struct UserDefinedSum { |
| 191 | + T operator()(T a, T b) { |
| 192 | + return a + b; |
| 193 | + } |
| 194 | +}; |
| 195 | +
|
| 196 | +q.submit([&](sycl::handler& h) { |
| 197 | + auto acc = sycl::accessor(buf, h); |
| 198 | +
|
| 199 | + constexpr size_t group_size = 256; |
| 200 | +
|
| 201 | + // Create enough local memory for the algorithm |
| 202 | + size_t temp_memory_size = group_size * sizeof(T); |
| 203 | + auto scratch = sycl::local_accessor<std::byte, 1>(temp_memory_size, h); |
| 204 | +
|
| 205 | + h.parallel_for(sycl::nd_range<1>{N, group_size}, [=](sycl::nd_item<1> it) { |
| 206 | + // Create a handle that associates the group with an allocation it can use |
| 207 | + auto handle = sycl::ext::oneapi::experimental::group_with_scratchpad( |
| 208 | + it.get_group(), sycl::span(&scratch[0], temp_memory_size)); |
| 209 | +
|
| 210 | + // Pass the handle as the first argument to the group algorithm |
| 211 | + T sum = sycl::ext::oneapi::experimental::reduce_over_group( |
| 212 | + handle, acc[it.get_global_id(0)], 0, UserDefinedSum<T>{}); |
| 213 | +
|
| 214 | + }); |
| 215 | +}); |
| 216 | +---- |
| 217 | + |
| 218 | +== Issues |
| 219 | + |
| 220 | +Open: |
| 221 | + |
| 222 | +. In future versions of this extension we may add a query function which would |
| 223 | +help to calculate the exact amount of memory needed for doing the reduction. |
0 commit comments