Skip to content

Commit 8e6d451

Browse files
authored
[SYCL][Graph][Doc] Add SYCL-Graph usage guide and example doc (#14965)
- Create new `syclgraph` folder in `doc` for syclgraph related, user-facing documents - Add a new usage guide document with some usage guidelines for common scenarios - Move examples from spec to usage guide to reduce bloat - Remove reductions from examples since they are not supported
1 parent 1d6fc10 commit 8e6d451

File tree

3 files changed

+519
-351
lines changed

3 files changed

+519
-351
lines changed

sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc

Lines changed: 3 additions & 351 deletions
Original file line numberDiff line numberDiff line change
@@ -1953,358 +1953,10 @@ code `invalid` if a user tries to add them to a graph.
19531953
Removing this restriction is something we may look at for future revisions of
19541954
`sycl_ext_oneapi_graph`.
19551955

1956-
== Examples
1956+
== Examples and Usage Guide
19571957

1958-
[NOTE]
1959-
====
1960-
The examples below demonstrate intended usage of the extension, but may not be
1961-
compatible with the proof-of-concept implementation, as the proof-of-concept
1962-
implementation is currently under development.
1963-
====
1964-
1965-
Examples for demonstrative purposes only, and may leave out details such as how
1966-
input data is set.
1967-
1968-
=== Dot Product
1969-
1970-
[source,c++]
1971-
----
1972-
...
1973-
1974-
#include <sycl/ext/oneapi/experimental/graph.hpp>
1975-
1976-
int main() {
1977-
namespace sycl_ext = sycl::ext::oneapi::experimental;
1978-
1979-
const size_t n = 10;
1980-
float alpha = 1.0f;
1981-
float beta = 2.0f;
1982-
float gamma = 3.0f;
1983-
1984-
sycl::queue q;
1985-
sycl_ext::command_graph g(q.get_context(), q.get_device());
1986-
1987-
float *dotp = sycl::malloc_shared<float>(1, q);
1988-
float *x = sycl::malloc_device<float>(n, q);
1989-
float *y = sycl::malloc_device<float>(n, q);
1990-
float *z = sycl::malloc_device<float>(n, q);
1991-
1992-
// Add commands to the graph to create the following topology.
1993-
//
1994-
// i
1995-
// / \
1996-
// a b
1997-
// \ /
1998-
// c
1999-
2000-
/* init data on the device */
2001-
auto node_i = g.add([&](sycl::handler& h) {
2002-
h.parallel_for(n, [=](sycl::id<1> it){
2003-
const size_t i = it[0];
2004-
x[i] = 1.0f;
2005-
y[i] = 2.0f;
2006-
z[i] = 3.0f;
2007-
});
2008-
});
2009-
2010-
auto node_a = g.add([&](sycl::handler& h) {
2011-
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
2012-
const size_t i = it[0];
2013-
x[i] = alpha * x[i] + beta * y[i];
2014-
});
2015-
}, { sycl_ext::property::node::depends_on(node_i)});
2016-
2017-
auto node_b = g.add([&](sycl::handler& h) {
2018-
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
2019-
const size_t i = it[0];
2020-
z[i] = gamma * z[i] + beta * y[i];
2021-
});
2022-
}, { sycl_ext::property::node::depends_on(node_i)});
2023-
2024-
auto node_c = g.add(
2025-
[&](sycl::handler& h) {
2026-
h.parallel_for(sycl::range<1>{n},
2027-
sycl::reduction(dotp, 0.0f, std::plus()),
2028-
[=](sycl::id<1> it, auto &sum) {
2029-
const size_t i = it[0];
2030-
sum += x[i] * z[i];
2031-
});
2032-
},
2033-
{ sycl_ext::property::node::depends_on(node_a, node_b)});
2034-
2035-
auto exec = g.finalize();
2036-
2037-
// use queue shortcut for graph submission
2038-
q.ext_oneapi_graph(exec).wait();
2039-
2040-
// memory can be freed inside or outside the graph
2041-
sycl::free(x, q);
2042-
sycl::free(y, q);
2043-
sycl::free(z, q);
2044-
sycl::free(dotp, q);
2045-
2046-
return 0;
2047-
}
2048-
2049-
2050-
...
2051-
----
2052-
2053-
=== Diamond Dependency
2054-
2055-
The following snippet of code shows how a SYCL `queue` can be put into a
2056-
recording state, which allows a `command_graph` object to be populated by the
2057-
command-groups submitted to the queue. Once the graph is complete, recording
2058-
finishes on the queue to put it back into the default executing state. The
2059-
graph is then finalized so that no more nodes can be added. Lastly, the graph is
2060-
submitted in its entirety for execution via
2061-
`handler::ext_oneapi_graph(command_graph<graph_state::executable>)`.
2062-
2063-
[source, c++]
2064-
----
2065-
using namespace sycl;
2066-
namespace sycl_ext = sycl::ext::oneapi::experimental;
2067-
2068-
queue q{default_selector{}};
2069-
2070-
// Lifetime of buffers must exceed the lifetime of graphs they are used in.
2071-
buffer<T> bufferA{dataA.data(), range<1>{elements}};
2072-
bufferA.set_write_back(false);
2073-
buffer<T> bufferB{dataB.data(), range<1>{elements}};
2074-
bufferB.set_write_back(false);
2075-
buffer<T> bufferC{dataC.data(), range<1>{elements}};
2076-
bufferC.set_write_back(false);
2077-
2078-
{
2079-
// New object representing graph of command-groups
2080-
sycl_ext::command_graph graph(q.get_context(), q.get_device(),
2081-
{sycl_ext::property::graph::assume_buffer_outlives_graph{}});
2082-
2083-
2084-
// `q` will be put in the recording state where commands are recorded to
2085-
// `graph` rather than submitted for execution immediately.
2086-
graph.begin_recording(q);
2087-
2088-
// Record commands to `graph` with the following topology.
2089-
//
2090-
// increment_kernel
2091-
// / \
2092-
// A->/ A->\
2093-
// / \
2094-
// add_kernel subtract_kernel
2095-
// \ /
2096-
// B->\ C->/
2097-
// \ /
2098-
// decrement_kernel
2099-
2100-
q.submit([&](handler& cgh) {
2101-
auto pData = bufferA.get_access<access::mode::read_write>(cgh);
2102-
cgh.parallel_for<increment_kernel>(range<1>(elements),
2103-
[=](item<1> id) { pData[id]++; });
2104-
});
2105-
2106-
q.submit([&](handler& cgh) {
2107-
auto pData1 = bufferA.get_access<access::mode::read>(cgh);
2108-
auto pData2 = bufferB.get_access<access::mode::read_write>(cgh);
2109-
cgh.parallel_for<add_kernel>(range<1>(elements),
2110-
[=](item<1> id) { pData2[id] += pData1[id]; });
2111-
});
2112-
2113-
q.submit([&](handler& cgh) {
2114-
auto pData1 = bufferA.get_access<access::mode::read>(cgh);
2115-
auto pData2 = bufferC.get_access<access::mode::read_write>(cgh);
2116-
cgh.parallel_for<subtract_kernel>(
2117-
range<1>(elements), [=](item<1> id) { pData2[id] -= pData1[id]; });
2118-
});
2119-
2120-
q.submit([&](handler& cgh) {
2121-
auto pData1 = bufferB.get_access<access::mode::read_write>(cgh);
2122-
auto pData2 = bufferC.get_access<access::mode::read_write>(cgh);
2123-
cgh.parallel_for<decrement_kernel>(range<1>(elements), [=](item<1> id) {
2124-
pData1[id]--;
2125-
pData2[id]--;
2126-
});
2127-
});
2128-
2129-
// queue `q` will be returned to the executing state where commands are
2130-
// submitted immediately for extension.
2131-
graph.end_recording();
2132-
2133-
// Finalize the modifiable graph to create an executable graph that can be
2134-
// submitted for execution.
2135-
auto exec_graph = graph.finalize();
2136-
2137-
// Execute graph
2138-
q.submit([&](handler& cgh) {
2139-
cgh.ext_oneapi_graph(exec_graph);
2140-
}).wait();
2141-
}
2142-
2143-
// Check output using host accessors
2144-
host_accessor hostAccA(bufferA);
2145-
host_accessor hostAccB(bufferB);
2146-
host_accessor hostAccC(bufferC);
2147-
2148-
...
2149-
----
2150-
2151-
=== Dynamic Parameter Update
2152-
2153-
Example showing a graph with a single kernel node that is created using a kernel
2154-
bundle with `handler::set_args()` and having its node arguments updated.
2155-
2156-
[source,c++]
2157-
----
2158-
...
2159-
2160-
using namespace sycl;
2161-
namespace sycl_ext = sycl::ext::oneapi::experimental;
2162-
2163-
queue myQueue;
2164-
auto myContext = myQueue.get_context();
2165-
auto myDevice = myQueue.get_device();
2166-
2167-
// USM allocations for kernel input/output
2168-
const size_t n = 1024;
2169-
int *ptrX = malloc_shared<int>(n, myQueue);
2170-
int *ptrY = malloc_device<int>(n, myQueue);
2171-
2172-
int *ptrZ = malloc_shared<int>(n, myQueue);
2173-
int *ptrQ = malloc_device<int>(n, myQueue);
2174-
2175-
// Kernel loaded from kernel bundle
2176-
const std::vector<kernel_id> builtinKernelIds =
2177-
myDevice.get_info<info::device::built_in_kernel_ids>();
2178-
kernel_bundle<bundle_state::executable> myBundle =
2179-
get_kernel_bundle(myContext, { myDevice }, builtinKernelIds);
2180-
kernel builtinKernel = myBundle.get_kernel(builtinKernelIds[0]);
2181-
2182-
// Graph containing a two kernels node
2183-
sycl_ext::command_graph myGraph(myContext, myDevice);
2184-
2185-
int myScalar = 42;
2186-
// Create graph dynamic parameters
2187-
dynamic_parameter dynParamInput(myGraph, ptrX);
2188-
dynamic_parameter dynParamScalar(myGraph, myScalar);
2189-
2190-
// First node uses ptrX as an input & output parameter, with operand
2191-
// mySclar as another argument.
2192-
node nodeA = myGraph.add([&](handler& cgh) {
2193-
cgh.set_args(dynParamInput, ptrY, dynParamScalar);
2194-
cgh.parallel_for(range {n}, builtinKernel);
2195-
});
2196-
2197-
// Create an executable graph with the updatable property.
2198-
auto execGraph = myGraph.finalize({sycl_ext::property::graph::updatable});
2199-
2200-
// Execute graph, then update without needing to wait for it to complete
2201-
myQueue.ext_oneapi_graph(execGraph);
2202-
2203-
// Change ptrX argument to node A to ptrZ
2204-
dynParamInput.update(ptrZ);
2205-
2206-
// Change myScalar argument to node A to newScalar
2207-
int newScalar = 12;
2208-
dynParamScalar.update(newScalar);
2209-
2210-
// Update nodeA in the executable graph with the new parameters
2211-
execGraph.update(nodeA);
2212-
// Execute graph again
2213-
myQueue.ext_oneapi_graph(execGraph);
2214-
myQueue.wait();
2215-
2216-
sycl::free(ptrX, myQueue);
2217-
sycl::free(ptrY, myQueue);
2218-
sycl::free(ptrZ, myQueue);
2219-
sycl::free(ptrQ, myQueue);
2220-
2221-
----
2222-
2223-
Example snippet showing how to use accessors with `dynamic_parameter` update:
2224-
[source,c++]
2225-
----
2226-
sycl::buffer bufferA{...};
2227-
sycl::buffer bufferB{...};
2228-
2229-
// Create graph dynamic parameter using a placeholder accessor, since the
2230-
// sycl::handler is not available here outside of the command-group scope.
2231-
dynamic_parameter dynParamAccessor(myGraph, bufferA.get_access());
2232-
2233-
node nodeA = myGraph.add([&](handler& cgh) {
2234-
// Require the accessor contained in the dynamic paramter
2235-
cgh.require(dynParamAccessor);
2236-
// Set the arg on the kernel using the dynamic parameter directly
2237-
cgh.set_args(dynParamAccessor);
2238-
cgh.parallel_for(range {n}, builtinKernel);
2239-
});
2240-
2241-
...
2242-
// Update the dynamic parameter with a placeholder accessor from bufferB instead
2243-
dynParamAccessor.update(bufferB.get_access());
2244-
----
2245-
2246-
=== Whole Graph Update
2247-
2248-
Example that shows recording and updating several nodes with different
2249-
parameters using <<whole-graph-update, Whole Graph Update>>.
2250-
2251-
[source,c++]
2252-
----
2253-
...
2254-
using namespace sycl;
2255-
namespace sycl_ext = sycl::ext::oneapi::experimental;
2256-
2257-
// Enqueue several kernels which use inputPtr
2258-
void run_kernels(int* inputPtr, queue syclQueue){
2259-
event eventA = syclQueue.submit([&](handler& CGH){
2260-
CGH.parallel_for(...);
2261-
});
2262-
event eventB = syclQueue.submit([&](handler& CGH){
2263-
CGH.depends_on(eventA);
2264-
CGH.parallel_for(...);
2265-
});
2266-
syclQueue.submit([&](handler& CGH){
2267-
CGH.depends_on(eventB);
2268-
CGH.parallel_for(...);
2269-
});
2270-
}
2271-
2272-
...
2273-
2274-
queue myQueue;
2275-
2276-
// USM allocations
2277-
const size_t n = 1024;
2278-
int *ptrA = malloc_device<int>(n, myQueue);
2279-
int *ptrB = malloc_device<int>(n, myQueue);
2280-
2281-
// Main graph which will be updated later
2282-
sycl_ext::command_graph mainGraph(myQueue);
2283-
2284-
// Record the kernels to mainGraph, using ptrA
2285-
mainGraph.begin_recording(myQueue);
2286-
run_kernels(ptrA, myQueue);
2287-
mainGraph.end_recording();
2288-
2289-
auto execMainGraph = mainGraph.finalize({sycl_ext::property::graph::updatable});
2290-
2291-
// Execute execMainGraph
2292-
myQueue.ext_oneapi_graph(execMainGraph);
2293-
2294-
// Record a second graph which records the same kernels, but using ptrB instead
2295-
sycl_ext::command_graph updateGraph(myQueue);
2296-
updateGraph.begin_recording(myQueue);
2297-
run_kernels(ptrB, myQueue);
2298-
updateGraph.end_recording();
2299-
2300-
// Update execMainGraph using updateGraph. We do not need to finalize
2301-
// updateGraph (this would be expensive)
2302-
execMainGraph.update(updateGraph);
2303-
2304-
// Execute execMainGraph again, which will now be operating on ptrB instead of
2305-
// ptrA
2306-
myQueue.ext_oneapi_graph(execMainGraph);
2307-
----
1958+
Detailed code examples and usage guidelines are provided in the
1959+
link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].
23081960

23091961
== Future Direction [[future-direction]]
23101962

sycl/doc/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Using oneAPI DPC++ for Application Development
1818
User API Reference <https://intel.github.io/llvm-docs/doxygen/group__sycl__api.html>
1919
EnvironmentVariables
2020
MultiTileCardWithLevelZero
21+
syclgraph/SYCLGraphUsageGuide
2122

2223
Design Documents for the oneAPI DPC++ Compiler
2324
----------------------------------------------

0 commit comments

Comments
 (0)