[SYCL][Graph][Doc] Add SYCL-Graph usage guide and example doc (#14965)

Bensuo · web-flow · commit 8e6d451e4054 · 2024-08-07T12:44:51.000+01:00
- Create new `syclgraph` folder in `doc` for syclgraph related,
user-facing documents
- Add a new usage guide document with some usage guidelines for common
scenarios
- Move examples from spec to usage guide to reduce bloat
- Remove reductions from examples since they are not supported
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
@@ -1953,358 +1953,10 @@ code `invalid` if a user tries to add them to a graph.
 Removing this restriction is something we may look at for future revisions of
 `sycl_ext_oneapi_graph`.
 
-== Examples
+== Examples and Usage Guide
 
-[NOTE]
-====
-The examples below demonstrate intended usage of the extension, but may not be
-compatible with the proof-of-concept implementation, as the proof-of-concept
-implementation is currently under development.
-====
-
-Examples for demonstrative purposes only, and may leave out details such as how
-input data is set.
-
-=== Dot Product
-
-[source,c++]
-----
-...
-
-#include <sycl/ext/oneapi/experimental/graph.hpp>
-
-int main() {
-  namespace sycl_ext = sycl::ext::oneapi::experimental;
-
-  const size_t n = 10;
-  float alpha = 1.0f;
-  float beta = 2.0f;
-  float gamma = 3.0f;
-
-  sycl::queue q;
-  sycl_ext::command_graph g(q.get_context(), q.get_device());
-
-  float *dotp = sycl::malloc_shared<float>(1, q);
-  float *x = sycl::malloc_device<float>(n, q);
-  float *y = sycl::malloc_device<float>(n, q);
-  float *z = sycl::malloc_device<float>(n, q);
-
-  // Add commands to the graph to create the following topology.
-  //
-  //     i
-  //    / \
-  //   a   b
-  //    \ /
-  //     c
-
-  /* init data on the device */
-  auto node_i = g.add([&](sycl::handler& h) {
-    h.parallel_for(n, [=](sycl::id<1> it){
-      const size_t i = it[0];
-      x[i] = 1.0f;
-      y[i] = 2.0f;
-      z[i] = 3.0f;
-    });
-  });
-
-  auto node_a = g.add([&](sycl::handler& h) {
-    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
-      const size_t i = it[0];
-      x[i] = alpha * x[i] + beta * y[i];
-    });
-  }, { sycl_ext::property::node::depends_on(node_i)});
-
-  auto node_b = g.add([&](sycl::handler& h) {
-    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
-      const size_t i = it[0];
-      z[i] = gamma * z[i] + beta * y[i];
-    });
-  }, { sycl_ext::property::node::depends_on(node_i)});
-
-  auto node_c = g.add(
-      [&](sycl::handler& h) {
-        h.parallel_for(sycl::range<1>{n},
-                       sycl::reduction(dotp, 0.0f, std::plus()),
-                       [=](sycl::id<1> it, auto &sum) {
-                         const size_t i = it[0];
-                         sum += x[i] * z[i];
-                       });
-      },
-      { sycl_ext::property::node::depends_on(node_a, node_b)});
-
-  auto exec = g.finalize();
-
-  // use queue shortcut for graph submission
-  q.ext_oneapi_graph(exec).wait();
-
-  // memory can be freed inside or outside the graph
-  sycl::free(x, q);
-  sycl::free(y, q);
-  sycl::free(z, q);
-  sycl::free(dotp, q);
-
-  return 0;
-}
-
-
-...
-----
-
-=== Diamond Dependency
-
-The following snippet of code shows how a SYCL `queue` can be put into a
-recording state, which allows a `command_graph` object to be populated by the
-command-groups submitted to the queue. Once the graph is complete, recording
-finishes on the queue to put it back into the default executing state. The
-graph is then finalized so that no more nodes can be added. Lastly, the graph is
-submitted in its entirety for execution via
-`handler::ext_oneapi_graph(command_graph<graph_state::executable>)`.
-
-[source, c++]
-----
-  using namespace sycl;
-  namespace sycl_ext = sycl::ext::oneapi::experimental;
-
-  queue q{default_selector{}};
-
-  // Lifetime of buffers must exceed the lifetime of graphs they are used in.
-  buffer<T> bufferA{dataA.data(), range<1>{elements}};
-  bufferA.set_write_back(false);
-  buffer<T> bufferB{dataB.data(), range<1>{elements}};
-  bufferB.set_write_back(false);
-  buffer<T> bufferC{dataC.data(), range<1>{elements}};
-  bufferC.set_write_back(false);
-
-  {
-    // New object representing graph of command-groups
-    sycl_ext::command_graph graph(q.get_context(), q.get_device(),
-          {sycl_ext::property::graph::assume_buffer_outlives_graph{}});
-
-
-    // `q` will be put in the recording state where commands are recorded to
-    // `graph` rather than submitted for execution immediately.
-    graph.begin_recording(q);
-
-    // Record commands to `graph` with the following topology.
-    //
-    //      increment_kernel
-    //       /         \
-    //   A->/        A->\
-    //     /             \
-    //   add_kernel  subtract_kernel
-    //     \             /
-    //   B->\        C->/
-    //       \         /
-    //     decrement_kernel
-
-    q.submit([&](handler& cgh) {
-      auto pData = bufferA.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<increment_kernel>(range<1>(elements),
-                                         [=](item<1> id) { pData[id]++; });
-    });
-
-    q.submit([&](handler& cgh) {
-      auto pData1 = bufferA.get_access<access::mode::read>(cgh);
-      auto pData2 = bufferB.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<add_kernel>(range<1>(elements),
-                                   [=](item<1> id) { pData2[id] += pData1[id]; });
-    });
-
-    q.submit([&](handler& cgh) {
-      auto pData1 = bufferA.get_access<access::mode::read>(cgh);
-      auto pData2 = bufferC.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<subtract_kernel>(
-          range<1>(elements), [=](item<1> id) { pData2[id] -= pData1[id]; });
-    });
-
-    q.submit([&](handler& cgh) {
-      auto pData1 = bufferB.get_access<access::mode::read_write>(cgh);
-      auto pData2 = bufferC.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<decrement_kernel>(range<1>(elements), [=](item<1> id) {
-        pData1[id]--;
-        pData2[id]--;
-      });
-    });
-
-    // queue `q` will be returned to the executing state where commands are
-    // submitted immediately for extension.
-    graph.end_recording();
-
-    // Finalize the modifiable graph to create an executable graph that can be
-    // submitted for execution.
-    auto exec_graph = graph.finalize();
-
-    // Execute graph
-    q.submit([&](handler& cgh) {
-      cgh.ext_oneapi_graph(exec_graph);
-    }).wait();
-  }
-
-  // Check output using host accessors
-  host_accessor hostAccA(bufferA);
-  host_accessor hostAccB(bufferB);
-  host_accessor hostAccC(bufferC);
-
-  ...
-----
-
-=== Dynamic Parameter Update
-
-Example showing a graph with a single kernel node that is created using a kernel
-bundle with `handler::set_args()` and having its node arguments updated.
-
-[source,c++]
-----
-...
-
-using namespace sycl;
-namespace sycl_ext = sycl::ext::oneapi::experimental;
-
-queue myQueue;
-auto myContext = myQueue.get_context();
-auto myDevice = myQueue.get_device();
-
-// USM allocations for kernel input/output
-const size_t n = 1024;
-int *ptrX = malloc_shared<int>(n, myQueue);
-int *ptrY = malloc_device<int>(n, myQueue);
-
-int *ptrZ = malloc_shared<int>(n, myQueue);
-int *ptrQ = malloc_device<int>(n, myQueue);
-
-// Kernel loaded from kernel bundle
-const std::vector<kernel_id> builtinKernelIds =
-      myDevice.get_info<info::device::built_in_kernel_ids>();
-kernel_bundle<bundle_state::executable> myBundle =
-      get_kernel_bundle(myContext, { myDevice }, builtinKernelIds);
-kernel builtinKernel = myBundle.get_kernel(builtinKernelIds[0]);
-
-// Graph containing a two kernels node
-sycl_ext::command_graph myGraph(myContext, myDevice);
-
-int myScalar = 42;
-// Create graph dynamic parameters
-dynamic_parameter dynParamInput(myGraph, ptrX);
-dynamic_parameter dynParamScalar(myGraph, myScalar);
-
-// First node uses ptrX as an input & output parameter, with operand
-// mySclar as another argument.
-node nodeA = myGraph.add([&](handler& cgh) {
-    cgh.set_args(dynParamInput, ptrY, dynParamScalar);
-    cgh.parallel_for(range {n}, builtinKernel);
-});
-
-// Create an executable graph with the updatable property.
-auto execGraph = myGraph.finalize({sycl_ext::property::graph::updatable});
-
-// Execute graph, then update without needing to wait for it to complete
-myQueue.ext_oneapi_graph(execGraph);
-
-// Change ptrX argument to node A to ptrZ
-dynParamInput.update(ptrZ);
-
-// Change myScalar argument to node A to newScalar
-int newScalar = 12;
-dynParamScalar.update(newScalar);
-
-// Update nodeA in the executable graph with the new parameters
-execGraph.update(nodeA);
-// Execute graph again
-myQueue.ext_oneapi_graph(execGraph);
-myQueue.wait();
-
-sycl::free(ptrX, myQueue);
-sycl::free(ptrY, myQueue);
-sycl::free(ptrZ, myQueue);
-sycl::free(ptrQ, myQueue);
-
-----
-
-Example snippet showing how to use accessors with `dynamic_parameter` update:
-[source,c++]
-----
-sycl::buffer bufferA{...};
-sycl::buffer bufferB{...};
-
-// Create graph dynamic parameter using a placeholder accessor, since the
-// sycl::handler is not available here outside of the command-group scope.
-dynamic_parameter dynParamAccessor(myGraph, bufferA.get_access());
-
-node nodeA = myGraph.add([&](handler& cgh) {
-    // Require the accessor contained in the dynamic paramter
-    cgh.require(dynParamAccessor);
-    // Set the arg on the kernel using the dynamic parameter directly
-    cgh.set_args(dynParamAccessor);
-    cgh.parallel_for(range {n}, builtinKernel);
-});
-
-...
-// Update the dynamic parameter with a placeholder accessor from bufferB instead
-dynParamAccessor.update(bufferB.get_access());
-----
-
-=== Whole Graph Update
-
-Example that shows recording and updating several nodes with different
-parameters using <<whole-graph-update, Whole Graph Update>>.
-
-[source,c++]
-----
-...
-using namespace sycl;
-namespace sycl_ext = sycl::ext::oneapi::experimental;
-
-// Enqueue several kernels which use inputPtr
-void run_kernels(int* inputPtr, queue syclQueue){
-  event eventA = syclQueue.submit([&](handler& CGH){
-    CGH.parallel_for(...);
-  });
-  event eventB = syclQueue.submit([&](handler& CGH){
-    CGH.depends_on(eventA);
-    CGH.parallel_for(...);
-  });
-  syclQueue.submit([&](handler& CGH){
-    CGH.depends_on(eventB);
-    CGH.parallel_for(...);
-  });
-}
-
-...
-
-queue myQueue;
-
-// USM allocations
-const size_t n = 1024;
-int *ptrA = malloc_device<int>(n, myQueue);
-int *ptrB = malloc_device<int>(n, myQueue);
-
-// Main graph which will be updated later
-sycl_ext::command_graph mainGraph(myQueue);
-
-// Record the kernels to mainGraph, using ptrA
-mainGraph.begin_recording(myQueue);
-run_kernels(ptrA, myQueue);
-mainGraph.end_recording();
-
-auto execMainGraph = mainGraph.finalize({sycl_ext::property::graph::updatable});
-
-// Execute execMainGraph
-myQueue.ext_oneapi_graph(execMainGraph);
-
-// Record a second graph which records the same kernels, but using ptrB instead
-sycl_ext::command_graph updateGraph(myQueue);
-updateGraph.begin_recording(myQueue);
-run_kernels(ptrB, myQueue);
-updateGraph.end_recording();
-
-// Update execMainGraph using updateGraph. We do not need to finalize
-// updateGraph (this would be expensive)
-execMainGraph.update(updateGraph);
-
-// Execute execMainGraph again, which will now be operating on ptrB instead of
-// ptrA
-myQueue.ext_oneapi_graph(execMainGraph);
-----
+Detailed code examples and usage guidelines are provided in the
+link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].
 
 == Future Direction [[future-direction]]
 
diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst
@@ -18,6 +18,7 @@ Using oneAPI DPC++ for Application Development
    User API Reference <https://intel.github.io/llvm-docs/doxygen/group__sycl__api.html>
    EnvironmentVariables
    MultiTileCardWithLevelZero
+   syclgraph/SYCLGraphUsageGuide
 
 Design Documents for the oneAPI DPC++ Compiler
 ----------------------------------------------
diff --git a/sycl/doc/syclgraph/SYCLGraphUsageGuide.md b/sycl/doc/syclgraph/SYCLGraphUsageGuide.md