Skip to content

Commit c5318c5

Browse files
jbrodmanbader
authored and committed
[SYCL][USM] Initial commit of flattening for kernel submission on queue (#911)
Add utility methods to flatten kernel submission to 1 lambda when using USM. Events for depends_on are passed as an extra function arg. Signed-off-by: James Brodman [email protected]
1 parent 1d55c86 commit c5318c5

File tree

2 files changed

+250
-2
lines changed

2 files changed

+250
-2
lines changed

sycl/include/CL/sycl/queue.hpp

Lines changed: 184 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,190 @@ class queue {
117117
}
118118

119119
/// Submits a command group that issues a prefetch hint for the memory
/// range [Ptr, Ptr + Count).
///
/// @param Ptr is the start of the memory range to prefetch
/// @param Count is the number of bytes to prefetch
/// @return an event representing the submitted command group
event prefetch(const void* Ptr, size_t Count) {
  return submit([=](handler &H) { H.prefetch(Ptr, Count); });
}
124+
125+
/// single_task version with a kernel represented as a lambda.
126+
///
127+
/// @param KernelFunc is the Kernel functor or lambda
128+
template <typename KernelName = detail::auto_name, typename KernelType>
129+
event single_task(KernelType KernelFunc) {
130+
return submit([&](handler &CGH) {
131+
CGH.template single_task<KernelName, KernelType>(KernelFunc);
132+
});
133+
}
134+
135+
/// single_task version with a kernel represented as a lambda.
136+
///
137+
/// @param DepEvent is an event that specifies the kernel dependences
138+
/// @param KernelFunc is the Kernel functor or lambda
139+
template <typename KernelName = detail::auto_name, typename KernelType>
140+
event single_task(event DepEvent, KernelType KernelFunc) {
141+
return submit([&](handler &CGH) {
142+
CGH.depends_on(DepEvent);
143+
CGH.template single_task<KernelName, KernelType>(KernelFunc);
144+
});
145+
}
146+
147+
/// single_task version with a kernel represented as a lambda.
148+
///
149+
/// @param DepEvents is a vector of events that specify the kernel dependences
150+
/// @param KernelFunc is the Kernel functor or lambda
151+
template <typename KernelName = detail::auto_name, typename KernelType>
152+
event single_task(std::vector<event> DepEvents, KernelType KernelFunc) {
153+
return submit([&](handler &CGH) {
154+
CGH.depends_on(DepEvents);
155+
CGH.template single_task<KernelName, KernelType>(KernelFunc);
156+
});
157+
}
158+
159+
/// parallel_for version with a kernel represented as a lambda + range that
160+
/// specifies global size only.
161+
///
162+
/// @param NumWorkItems is a range that specifies the work space of the kernel
163+
/// @param KernelFunc is the Kernel functor or lambda
164+
template <typename KernelName = detail::auto_name, typename KernelType,
165+
int Dims>
166+
event parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc) {
167+
return submit([&](handler &CGH) {
168+
CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
169+
KernelFunc);
170+
});
171+
}
172+
173+
/// parallel_for version with a kernel represented as a lambda + range that
174+
/// specifies global size only.
175+
///
176+
/// @param NumWorkItems is a range that specifies the work space of the kernel
177+
/// @param DepEvent is an event that specifies the kernel dependences
178+
/// @param KernelFunc is the Kernel functor or lambda
179+
template <typename KernelName = detail::auto_name, typename KernelType,
180+
int Dims>
181+
event parallel_for(range<Dims> NumWorkItems, event DepEvent,
182+
KernelType KernelFunc) {
183+
return submit([&](handler &CGH) {
184+
CGH.depends_on(DepEvent);
185+
CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
186+
KernelFunc);
187+
});
188+
}
189+
190+
/// parallel_for version with a kernel represented as a lambda + range that
191+
/// specifies global size only.
192+
///
193+
/// @param NumWorkItems is a range that specifies the work space of the kernel
194+
/// @param DepEvents is a vector of events that specifies the kernel dependences
195+
/// @param KernelFunc is the Kernel functor or lambda
196+
template <typename KernelName = detail::auto_name, typename KernelType,
197+
int Dims>
198+
event parallel_for(range<Dims> NumWorkItems, std::vector<event> DepEvents,
199+
KernelType KernelFunc) {
200+
return submit([&](handler &CGH) {
201+
CGH.depends_on(DepEvents);
202+
CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
203+
KernelFunc);
204+
});
205+
}
206+
207+
/// parallel_for version with a kernel represented as a lambda + range and
208+
/// offset that specify global size and global offset correspondingly.
209+
///
210+
/// @param NumWorkItems is a range that specifies the work space of the kernel
211+
/// @param WorkItemOffset specifies the offset for each work item id
212+
/// @param KernelFunc is the Kernel functor or lambda
213+
template <typename KernelName = detail::auto_name, typename KernelType,
214+
int Dims>
215+
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
216+
KernelType KernelFunc) {
217+
return submit([&](handler &CGH) {
218+
CGH.template parallel_for<KernelName, KernelType, Dims>(
219+
NumWorkItems, WorkItemOffset, KernelFunc);
220+
});
221+
}
222+
223+
/// parallel_for version with a kernel represented as a lambda + range and
224+
/// offset that specify global size and global offset correspondingly.
225+
///
226+
/// @param NumWorkItems is a range that specifies the work space of the kernel
227+
/// @param WorkItemOffset specifies the offset for each work item id
228+
/// @param DepEvent is an event that specifies the kernel dependences
229+
/// @param KernelFunc is the Kernel functor or lambda
230+
template <typename KernelName = detail::auto_name, typename KernelType,
231+
int Dims>
232+
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
233+
event DepEvent, KernelType KernelFunc) {
234+
return submit([&](handler &CGH) {
235+
CGH.depends_on(DepEvent);
236+
CGH.template parallel_for<KernelName, KernelType, Dims>(
237+
NumWorkItems, WorkItemOffset, KernelFunc);
238+
});
239+
}
240+
241+
/// parallel_for version with a kernel represented as a lambda + range and
242+
/// offset that specify global size and global offset correspondingly.
243+
///
244+
/// @param NumWorkItems is a range that specifies the work space of the kernel
245+
/// @param WorkItemOffset specifies the offset for each work item id
246+
/// @param DepEvents is a vector of events that specifies the kernel dependences
247+
/// @param KernelFunc is the Kernel functor or lambda
248+
template <typename KernelName = detail::auto_name, typename KernelType,
249+
int Dims>
250+
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
251+
std::vector<event> DepEvents, KernelType KernelFunc) {
252+
return submit([&](handler &CGH) {
253+
CGH.depends_on(DepEvents);
254+
CGH.template parallel_for<KernelName, KernelType, Dims>(
255+
NumWorkItems, WorkItemOffset, KernelFunc);
256+
});
257+
}
258+
259+
/// parallel_for version with a kernel represented as a lambda + nd_range that
260+
/// specifies global, local sizes and offset.
261+
///
262+
/// @param ExecutionRange is a range that specifies the work space of the kernel
263+
/// @param KernelFunc is the Kernel functor or lambda
264+
template <typename KernelName = detail::auto_name, typename KernelType,
265+
int Dims>
266+
event parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc) {
267+
return submit([&](handler &CGH) {
268+
CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
269+
KernelFunc);
270+
});
271+
}
272+
273+
/// parallel_for version with a kernel represented as a lambda + nd_range that
274+
/// specifies global, local sizes and offset.
275+
///
276+
/// @param ExecutionRange is a range that specifies the work space of the kernel
277+
/// @param DepEvent is an event that specifies the kernel dependences
278+
/// @param KernelFunc is the Kernel functor or lambda
279+
template <typename KernelName = detail::auto_name, typename KernelType,
280+
int Dims>
281+
event parallel_for(nd_range<Dims> ExecutionRange,
282+
event DepEvent, KernelType KernelFunc) {
283+
return submit([&](handler &CGH) {
284+
CGH.depends_on(DepEvent);
285+
CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
286+
KernelFunc);
287+
});
288+
}
289+
290+
/// parallel_for version with a kernel represented as a lambda + nd_range that
291+
/// specifies global, local sizes and offset.
292+
///
293+
/// @param ExecutionRange is a range that specifies the work space of the kernel
294+
/// @param DepEvents is a vector of events that specifies the kernel dependences
295+
/// @param KernelFunc is the Kernel functor or lambda
296+
template <typename KernelName = detail::auto_name, typename KernelType,
297+
int Dims>
298+
event parallel_for(nd_range<Dims> ExecutionRange,
299+
std::vector<event> DepEvents, KernelType KernelFunc) {
300+
return submit([&](handler &CGH) {
301+
CGH.depends_on(DepEvents);
302+
CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
303+
KernelFunc);
122304
});
123305
}
124306

sycl/test/usm/pfor_flatten.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda %s -o %t1.out
2+
// RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
3+
// RUN: %CPU_RUN_PLACEHOLDER %t1.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t1.out
5+
6+
//==--------------- pfor_flatten.cpp - Kernel Launch Flattening test -------==//
7+
//
8+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9+
// See https://llvm.org/LICENSE.txt for license information.
10+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include <CL/sycl.hpp>
15+
16+
using namespace cl::sycl;
17+
18+
class foo;
19+
int main() {
20+
int *array = nullptr;
21+
const int N = 42;
22+
const int MAGIC_NUM = 42;
23+
24+
queue q;
25+
auto ctxt = q.get_context();
26+
27+
array = (int *)malloc_host(N * sizeof(int), q);
28+
if (array == nullptr) {
29+
return -1;
30+
}
31+
32+
range<1> R{N};
33+
auto e1 = q.parallel_for(R, [=](id<1> ID) {
34+
int i = ID[0];
35+
array[i] = MAGIC_NUM-4;
36+
});
37+
38+
39+
auto e2 = q.parallel_for(R, e1, [=](id<1> ID) {
40+
int i = ID[0];
41+
array[i] += 2;
42+
});
43+
44+
auto e3 =
45+
q.parallel_for(nd_range<1>{R, range<1>{1}}, {e1, e2}, [=](nd_item<1> ID) {
46+
int i = ID.get_global_id(0);
47+
array[i]++;
48+
});
49+
50+
q.single_task({e3}, [=]() {
51+
for (int i = 0; i < N; i++) {
52+
array[i]++;
53+
}
54+
});
55+
56+
q.wait();
57+
58+
for (int i = 0; i < N; i++) {
59+
if (array[i] != MAGIC_NUM) {
60+
return -1;
61+
}
62+
}
63+
free(array, ctxt);
64+
65+
return 0;
66+
}

0 commit comments

Comments
 (0)