@@ -1953,358 +1953,10 @@ code `invalid` if a user tries to add them to a graph.
1953
1953
Removing this restriction is something we may look at for future revisions of
1954
1954
`sycl_ext_oneapi_graph`.
1955
1955
1956
- == Examples
1956
+ == Examples and Usage Guide
1957
1957
1958
- [NOTE]
1959
- ====
1960
- The examples below demonstrate intended usage of the extension, but may not be
1961
- compatible with the proof-of-concept implementation, as the proof-of-concept
1962
- implementation is currently under development.
1963
- ====
1964
-
1965
- Examples for demonstrative purposes only, and may leave out details such as how
1966
- input data is set.
1967
-
1968
- === Dot Product
1969
-
1970
- [source,c++]
1971
- ----
1972
- ...
1973
-
1974
- #include <sycl/ext/oneapi/experimental/graph.hpp>
1975
-
1976
- int main() {
1977
- namespace sycl_ext = sycl::ext::oneapi::experimental;
1978
-
1979
- const size_t n = 10;
1980
- float alpha = 1.0f;
1981
- float beta = 2.0f;
1982
- float gamma = 3.0f;
1983
-
1984
- sycl::queue q;
1985
- sycl_ext::command_graph g(q.get_context(), q.get_device());
1986
-
1987
- float *dotp = sycl::malloc_shared<float>(1, q);
1988
- float *x = sycl::malloc_device<float>(n, q);
1989
- float *y = sycl::malloc_device<float>(n, q);
1990
- float *z = sycl::malloc_device<float>(n, q);
1991
-
1992
- // Add commands to the graph to create the following topology.
1993
- //
1994
- // i
1995
- // / \
1996
- // a b
1997
- // \ /
1998
- // c
1999
-
2000
- /* init data on the device */
2001
- auto node_i = g.add([&](sycl::handler& h) {
2002
- h.parallel_for(n, [=](sycl::id<1> it){
2003
- const size_t i = it[0];
2004
- x[i] = 1.0f;
2005
- y[i] = 2.0f;
2006
- z[i] = 3.0f;
2007
- });
2008
- });
2009
-
2010
- auto node_a = g.add([&](sycl::handler& h) {
2011
- h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
2012
- const size_t i = it[0];
2013
- x[i] = alpha * x[i] + beta * y[i];
2014
- });
2015
- }, { sycl_ext::property::node::depends_on(node_i)});
2016
-
2017
- auto node_b = g.add([&](sycl::handler& h) {
2018
- h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
2019
- const size_t i = it[0];
2020
- z[i] = gamma * z[i] + beta * y[i];
2021
- });
2022
- }, { sycl_ext::property::node::depends_on(node_i)});
2023
-
2024
- auto node_c = g.add(
2025
- [&](sycl::handler& h) {
2026
- h.parallel_for(sycl::range<1>{n},
2027
- sycl::reduction(dotp, 0.0f, std::plus()),
2028
- [=](sycl::id<1> it, auto &sum) {
2029
- const size_t i = it[0];
2030
- sum += x[i] * z[i];
2031
- });
2032
- },
2033
- { sycl_ext::property::node::depends_on(node_a, node_b)});
2034
-
2035
- auto exec = g.finalize();
2036
-
2037
- // use queue shortcut for graph submission
2038
- q.ext_oneapi_graph(exec).wait();
2039
-
2040
- // memory can be freed inside or outside the graph
2041
- sycl::free(x, q);
2042
- sycl::free(y, q);
2043
- sycl::free(z, q);
2044
- sycl::free(dotp, q);
2045
-
2046
- return 0;
2047
- }
2048
-
2049
-
2050
- ...
2051
- ----
2052
-
2053
- === Diamond Dependency
2054
-
2055
- The following snippet of code shows how a SYCL `queue` can be put into a
2056
- recording state, which allows a `command_graph` object to be populated by the
2057
- command-groups submitted to the queue. Once the graph is complete, recording
2058
- finishes on the queue to put it back into the default executing state. The
2059
- graph is then finalized so that no more nodes can be added. Lastly, the graph is
2060
- submitted in its entirety for execution via
2061
- `handler::ext_oneapi_graph(command_graph<graph_state::executable>)`.
2062
-
2063
- [source, c++]
2064
- ----
2065
- using namespace sycl;
2066
- namespace sycl_ext = sycl::ext::oneapi::experimental;
2067
-
2068
- queue q{default_selector{}};
2069
-
2070
- // Lifetime of buffers must exceed the lifetime of graphs they are used in.
2071
- buffer<T> bufferA{dataA.data(), range<1>{elements}};
2072
- bufferA.set_write_back(false);
2073
- buffer<T> bufferB{dataB.data(), range<1>{elements}};
2074
- bufferB.set_write_back(false);
2075
- buffer<T> bufferC{dataC.data(), range<1>{elements}};
2076
- bufferC.set_write_back(false);
2077
-
2078
- {
2079
- // New object representing graph of command-groups
2080
- sycl_ext::command_graph graph(q.get_context(), q.get_device(),
2081
- {sycl_ext::property::graph::assume_buffer_outlives_graph{}});
2082
-
2083
-
2084
- // `q` will be put in the recording state where commands are recorded to
2085
- // `graph` rather than submitted for execution immediately.
2086
- graph.begin_recording(q);
2087
-
2088
- // Record commands to `graph` with the following topology.
2089
- //
2090
- // increment_kernel
2091
- // / \
2092
- // A->/ A->\
2093
- // / \
2094
- // add_kernel subtract_kernel
2095
- // \ /
2096
- // B->\ C->/
2097
- // \ /
2098
- // decrement_kernel
2099
-
2100
- q.submit([&](handler& cgh) {
2101
- auto pData = bufferA.get_access<access::mode::read_write>(cgh);
2102
- cgh.parallel_for<increment_kernel>(range<1>(elements),
2103
- [=](item<1> id) { pData[id]++; });
2104
- });
2105
-
2106
- q.submit([&](handler& cgh) {
2107
- auto pData1 = bufferA.get_access<access::mode::read>(cgh);
2108
- auto pData2 = bufferB.get_access<access::mode::read_write>(cgh);
2109
- cgh.parallel_for<add_kernel>(range<1>(elements),
2110
- [=](item<1> id) { pData2[id] += pData1[id]; });
2111
- });
2112
-
2113
- q.submit([&](handler& cgh) {
2114
- auto pData1 = bufferA.get_access<access::mode::read>(cgh);
2115
- auto pData2 = bufferC.get_access<access::mode::read_write>(cgh);
2116
- cgh.parallel_for<subtract_kernel>(
2117
- range<1>(elements), [=](item<1> id) { pData2[id] -= pData1[id]; });
2118
- });
2119
-
2120
- q.submit([&](handler& cgh) {
2121
- auto pData1 = bufferB.get_access<access::mode::read_write>(cgh);
2122
- auto pData2 = bufferC.get_access<access::mode::read_write>(cgh);
2123
- cgh.parallel_for<decrement_kernel>(range<1>(elements), [=](item<1> id) {
2124
- pData1[id]--;
2125
- pData2[id]--;
2126
- });
2127
- });
2128
-
2129
- // queue `q` will be returned to the executing state where commands are
2130
- // submitted immediately for extension.
2131
- graph.end_recording();
2132
-
2133
- // Finalize the modifiable graph to create an executable graph that can be
2134
- // submitted for execution.
2135
- auto exec_graph = graph.finalize();
2136
-
2137
- // Execute graph
2138
- q.submit([&](handler& cgh) {
2139
- cgh.ext_oneapi_graph(exec_graph);
2140
- }).wait();
2141
- }
2142
-
2143
- // Check output using host accessors
2144
- host_accessor hostAccA(bufferA);
2145
- host_accessor hostAccB(bufferB);
2146
- host_accessor hostAccC(bufferC);
2147
-
2148
- ...
2149
- ----
2150
-
2151
- === Dynamic Parameter Update
2152
-
2153
- Example showing a graph with a single kernel node that is created using a kernel
2154
- bundle with `handler::set_args()` and having its node arguments updated.
2155
-
2156
- [source,c++]
2157
- ----
2158
- ...
2159
-
2160
- using namespace sycl;
2161
- namespace sycl_ext = sycl::ext::oneapi::experimental;
2162
-
2163
- queue myQueue;
2164
- auto myContext = myQueue.get_context();
2165
- auto myDevice = myQueue.get_device();
2166
-
2167
- // USM allocations for kernel input/output
2168
- const size_t n = 1024;
2169
- int *ptrX = malloc_shared<int>(n, myQueue);
2170
- int *ptrY = malloc_device<int>(n, myQueue);
2171
-
2172
- int *ptrZ = malloc_shared<int>(n, myQueue);
2173
- int *ptrQ = malloc_device<int>(n, myQueue);
2174
-
2175
- // Kernel loaded from kernel bundle
2176
- const std::vector<kernel_id> builtinKernelIds =
2177
- myDevice.get_info<info::device::built_in_kernel_ids>();
2178
- kernel_bundle<bundle_state::executable> myBundle =
2179
- get_kernel_bundle(myContext, { myDevice }, builtinKernelIds);
2180
- kernel builtinKernel = myBundle.get_kernel(builtinKernelIds[0]);
2181
-
2182
- // Graph containing a two kernels node
2183
- sycl_ext::command_graph myGraph(myContext, myDevice);
2184
-
2185
- int myScalar = 42;
2186
- // Create graph dynamic parameters
2187
- dynamic_parameter dynParamInput(myGraph, ptrX);
2188
- dynamic_parameter dynParamScalar(myGraph, myScalar);
2189
-
2190
- // First node uses ptrX as an input & output parameter, with operand
2191
- // mySclar as another argument.
2192
- node nodeA = myGraph.add([&](handler& cgh) {
2193
- cgh.set_args(dynParamInput, ptrY, dynParamScalar);
2194
- cgh.parallel_for(range {n}, builtinKernel);
2195
- });
2196
-
2197
- // Create an executable graph with the updatable property.
2198
- auto execGraph = myGraph.finalize({sycl_ext::property::graph::updatable});
2199
-
2200
- // Execute graph, then update without needing to wait for it to complete
2201
- myQueue.ext_oneapi_graph(execGraph);
2202
-
2203
- // Change ptrX argument to node A to ptrZ
2204
- dynParamInput.update(ptrZ);
2205
-
2206
- // Change myScalar argument to node A to newScalar
2207
- int newScalar = 12;
2208
- dynParamScalar.update(newScalar);
2209
-
2210
- // Update nodeA in the executable graph with the new parameters
2211
- execGraph.update(nodeA);
2212
- // Execute graph again
2213
- myQueue.ext_oneapi_graph(execGraph);
2214
- myQueue.wait();
2215
-
2216
- sycl::free(ptrX, myQueue);
2217
- sycl::free(ptrY, myQueue);
2218
- sycl::free(ptrZ, myQueue);
2219
- sycl::free(ptrQ, myQueue);
2220
-
2221
- ----
2222
-
2223
- Example snippet showing how to use accessors with `dynamic_parameter` update:
2224
- [source,c++]
2225
- ----
2226
- sycl::buffer bufferA{...};
2227
- sycl::buffer bufferB{...};
2228
-
2229
- // Create graph dynamic parameter using a placeholder accessor, since the
2230
- // sycl::handler is not available here outside of the command-group scope.
2231
- dynamic_parameter dynParamAccessor(myGraph, bufferA.get_access());
2232
-
2233
- node nodeA = myGraph.add([&](handler& cgh) {
2234
- // Require the accessor contained in the dynamic paramter
2235
- cgh.require(dynParamAccessor);
2236
- // Set the arg on the kernel using the dynamic parameter directly
2237
- cgh.set_args(dynParamAccessor);
2238
- cgh.parallel_for(range {n}, builtinKernel);
2239
- });
2240
-
2241
- ...
2242
- // Update the dynamic parameter with a placeholder accessor from bufferB instead
2243
- dynParamAccessor.update(bufferB.get_access());
2244
- ----
2245
-
2246
- === Whole Graph Update
2247
-
2248
- Example that shows recording and updating several nodes with different
2249
- parameters using <<whole-graph-update, Whole Graph Update>>.
2250
-
2251
- [source,c++]
2252
- ----
2253
- ...
2254
- using namespace sycl;
2255
- namespace sycl_ext = sycl::ext::oneapi::experimental;
2256
-
2257
- // Enqueue several kernels which use inputPtr
2258
- void run_kernels(int* inputPtr, queue syclQueue){
2259
- event eventA = syclQueue.submit([&](handler& CGH){
2260
- CGH.parallel_for(...);
2261
- });
2262
- event eventB = syclQueue.submit([&](handler& CGH){
2263
- CGH.depends_on(eventA);
2264
- CGH.parallel_for(...);
2265
- });
2266
- syclQueue.submit([&](handler& CGH){
2267
- CGH.depends_on(eventB);
2268
- CGH.parallel_for(...);
2269
- });
2270
- }
2271
-
2272
- ...
2273
-
2274
- queue myQueue;
2275
-
2276
- // USM allocations
2277
- const size_t n = 1024;
2278
- int *ptrA = malloc_device<int>(n, myQueue);
2279
- int *ptrB = malloc_device<int>(n, myQueue);
2280
-
2281
- // Main graph which will be updated later
2282
- sycl_ext::command_graph mainGraph(myQueue);
2283
-
2284
- // Record the kernels to mainGraph, using ptrA
2285
- mainGraph.begin_recording(myQueue);
2286
- run_kernels(ptrA, myQueue);
2287
- mainGraph.end_recording();
2288
-
2289
- auto execMainGraph = mainGraph.finalize({sycl_ext::property::graph::updatable});
2290
-
2291
- // Execute execMainGraph
2292
- myQueue.ext_oneapi_graph(execMainGraph);
2293
-
2294
- // Record a second graph which records the same kernels, but using ptrB instead
2295
- sycl_ext::command_graph updateGraph(myQueue);
2296
- updateGraph.begin_recording(myQueue);
2297
- run_kernels(ptrB, myQueue);
2298
- updateGraph.end_recording();
2299
-
2300
- // Update execMainGraph using updateGraph. We do not need to finalize
2301
- // updateGraph (this would be expensive)
2302
- execMainGraph.update(updateGraph);
2303
-
2304
- // Execute execMainGraph again, which will now be operating on ptrB instead of
2305
- // ptrA
2306
- myQueue.ext_oneapi_graph(execMainGraph);
2307
- ----
1958
+ Detailed code examples and usage guidelines are provided in the
1959
+ link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].
2308
1960
2309
1961
== Future Direction [[future-direction]]
2310
1962
0 commit comments