Update the kernel_queue_specific::max_num_work_group query (#16051)

0x12CC · martygrant · web-flow · commit 336b89e285f7 · 2024-12-06T12:04:31.000Z
Update calls to `urKernelSuggestMaxCooperativeGroupCountExp` to match the changes in oneapi-src/unified-runtime#2316. This PR also implements the `range<1>` and `range<2>` overloads of the `ext_oneapi_get_info` API from [sycl_ext_oneapi_launch_queries](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc). --------- Signed-off-by: Michael Aziz <michael.aziz@intel.com> Co-authored-by: Martin Morrison-Grant <martin.morrisongrant@codeplay.com>
diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake
@@ -1,7 +1,7 @@
-# commit 27398080349f1d8d21d6a8680e234d29dcd14734 (HEAD, origin/main, origin/HEAD)
-# Merge: 572355db942d dc971af72a31
+# commit 72e80a42cc8e5b11d43dd9d34b40d470e1476181 (HEAD, origin/main, origin/HEAD)
+# Merge:  6e5d0e6b9a47 9c7e56cc765b
 # Author: Martin Grant <martin.morrisongrant@codeplay.com>
-# Date:   Thu Dec 5 14:57:07 2024 +0000
-#   Merge pull request #2293 from yingcong-wu/yc-PR/241107-misc-minor-fix
-#   [DeviceAsan] Serval bug fixes
-set(UNIFIED_RUNTIME_TAG 27398080349f1d8d21d6a8680e234d29dcd14734)
+# Date:   Fri Dec 6 10:11:15 2024 +0000
+#   Merge pull request #2316 from 0x12CC/coop_kernel_query
+#   Change `urSuggestMaxCooperativeGroupCountExp` to accept ND size parameter
+set(UNIFIED_RUNTIME_TAG 72e80a42cc8e5b11d43dd9d34b40d470e1476181)
diff --git a/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def b/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def
@@ -1,4 +1 @@
-// TODO: Revisit 'max_num_work_group_sync' and align it with the
-// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
-__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
 __SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)
diff --git a/sycl/include/sycl/kernel.hpp b/sycl/include/sycl/kernel.hpp
@@ -168,6 +168,32 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
   typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
   ext_oneapi_get_info(queue Queue) const;
 
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
   /// Query queue/launch-specific information from a kernel using the
   /// info::kernel_queue_specific descriptor for a specific Queue and values.
   /// max_num_work_groups is the only valid descriptor for this function.
diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp
@@ -106,38 +106,6 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
       "interoperability function or to query a device built-in kernel");
 }
 
-bool kernel_impl::exceedsOccupancyResourceLimits(
-    const device &Device, const range<3> &WorkGroupSize,
-    size_t DynamicLocalMemorySize) const {
-  // Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
-  // Generally, exceeding hardware resource limits will yield in an error when
-  // the kernel is launched.
-  const size_t MaxWorkGroupSize =
-      get_info<info::kernel_device_specific::work_group_size>(Device);
-  const size_t MaxLocalMemorySizeInBytes =
-      Device.get_info<info::device::local_mem_size>();
-
-  if (WorkGroupSize.size() > MaxWorkGroupSize)
-    return true;
-
-  if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
-    return true;
-
-  // It will be impossible to launch a kernel for Cuda when the hardware limit
-  // for the 32-bit registers page file size is exceeded.
-  if (Device.get_backend() == backend::ext_oneapi_cuda) {
-    const uint32_t RegsPerWorkItem =
-        get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
-    const uint32_t MaxRegsPerWorkGroup =
-        Device.get_info<ext::codeplay::experimental::info::device::
-                            max_registers_per_work_group>();
-    if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
-      return true;
-  }
-
-  return false;
-}
-
 template <>
 typename info::platform::version::return_type
 kernel_impl::get_backend_info<info::platform::version>() const {
diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp
@@ -123,6 +123,32 @@ class kernel_impl {
   template <typename Param>
   typename Param::return_type ext_oneapi_get_info(queue Queue) const;
 
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename Param::return_type
+  ext_oneapi_get_info(queue Queue, const range<1> &MaxWorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename Param::return_type
+  ext_oneapi_get_info(queue Queue, const range<2> &MaxWorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
   /// Query queue/launch-specific information from a kernel using the
   /// info::kernel_queue_specific descriptor for a specific Queue and values.
   /// max_num_work_groups is the only valid descriptor for this function.
@@ -193,11 +219,49 @@ class kernel_impl {
 
   /// Check if the occupancy limits are exceeded for the given kernel launch
   /// configuration.
+  template <int Dimensions>
   bool exceedsOccupancyResourceLimits(const device &Device,
-                                      const range<3> &WorkGroupSize,
+                                      const range<Dimensions> &WorkGroupSize,
                                       size_t DynamicLocalMemorySize) const;
+  template <int Dimensions>
+  size_t queryMaxNumWorkGroups(queue Queue,
+                               const range<Dimensions> &WorkGroupSize,
+                               size_t DynamicLocalMemorySize) const;
 };
 
+template <int Dimensions>
+bool kernel_impl::exceedsOccupancyResourceLimits(
+    const device &Device, const range<Dimensions> &WorkGroupSize,
+    size_t DynamicLocalMemorySize) const {
+  // Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
+  // Generally, exceeding hardware resource limits will yield in an error when
+  // the kernel is launched.
+  const size_t MaxWorkGroupSize =
+      get_info<info::kernel_device_specific::work_group_size>(Device);
+  const size_t MaxLocalMemorySizeInBytes =
+      Device.get_info<info::device::local_mem_size>();
+
+  if (WorkGroupSize.size() > MaxWorkGroupSize)
+    return true;
+
+  if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
+    return true;
+
+  // It will be impossible to launch a kernel for Cuda when the hardware limit
+  // for the 32-bit registers page file size is exceeded.
+  if (Device.get_backend() == backend::ext_oneapi_cuda) {
+    const uint32_t RegsPerWorkItem =
+        get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
+    const uint32_t MaxRegsPerWorkGroup =
+        Device.get_info<ext::codeplay::experimental::info::device::
+                            max_registers_per_work_group>();
+    if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
+      return true;
+  }
+
+  return false;
+}
+
 template <typename Param>
 inline typename Param::return_type kernel_impl::get_info() const {
   static_assert(is_kernel_info_desc<Param>::value,
@@ -244,13 +308,11 @@ kernel_impl::get_info(const device &Device,
 
 namespace syclex = ext::oneapi::experimental;
 
-template <>
-inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
-    return_type
-    kernel_impl::ext_oneapi_get_info<
-        syclex::info::kernel_queue_specific::max_num_work_groups>(
-        queue Queue, const range<3> &WorkGroupSize,
-        size_t DynamicLocalMemorySize) const {
+template <int Dimensions>
+size_t
+kernel_impl::queryMaxNumWorkGroups(queue Queue,
+                                   const range<Dimensions> &WorkGroupSize,
+                                   size_t DynamicLocalMemorySize) const {
   if (WorkGroupSize.size() == 0)
     throw exception(sycl::make_error_code(errc::invalid),
                     "The launch work-group size cannot be zero.");
@@ -259,12 +321,21 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
   const auto &Handle = getHandleRef();
   auto Device = Queue.get_device();
 
+  size_t WG[Dimensions];
+  WG[0] = WorkGroupSize[0];
+  if constexpr (Dimensions >= 2)
+    WG[1] = WorkGroupSize[1];
+  if constexpr (Dimensions == 3)
+    WG[2] = WorkGroupSize[2];
+
   uint32_t GroupCount{0};
   if (auto Result = Adapter->call_nocheck<
                     UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
-          Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
-      Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
-    // The feature is supported. Check for other errors and throw if any.
+          Handle, Dimensions, WG, DynamicLocalMemorySize, &GroupCount);
+      Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE &&
+      Result != UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE) {
+    // The feature is supported and the group size is valid. Check for other
+    // errors and throw if any.
     Adapter->checkUrResult(Result);
     return GroupCount;
   }
@@ -278,30 +349,33 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
 }
 
 template <>
-inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
     return_type
     kernel_impl::ext_oneapi_get_info<
-        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
-        queue Queue, const range<3> &WorkGroupSize,
+        syclex::info::kernel_queue_specific::max_num_work_groups>(
+        queue Queue, const range<1> &WorkGroupSize,
         size_t DynamicLocalMemorySize) const {
-  return ext_oneapi_get_info<
-      syclex::info::kernel_queue_specific::max_num_work_groups>(
-      Queue, WorkGroupSize, DynamicLocalMemorySize);
+  return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
 }
 
 template <>
-inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
     return_type
     kernel_impl::ext_oneapi_get_info<
-        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
-        queue Queue) const {
-  auto Device = Queue.get_device();
-  const auto MaxWorkGroupSize =
-      get_info<info::kernel_device_specific::work_group_size>(Device);
-  const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
-  return ext_oneapi_get_info<
-      syclex::info::kernel_queue_specific::max_num_work_group_sync>(
-      Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
+        syclex::info::kernel_queue_specific::max_num_work_groups>(
+        queue Queue, const range<2> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
+}
+
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
+    return_type
+    kernel_impl::ext_oneapi_get_info<
+        syclex::info::kernel_queue_specific::max_num_work_groups>(
+        queue Queue, const range<3> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
 }
 
 } // namespace detail
diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp
@@ -113,26 +113,41 @@ kernel::ext_oneapi_get_info(queue Queue) const {
 
 template <typename Param>
 typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
-kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+kernel::ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
                             size_t DynamicLocalMemorySize) const {
   return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
                                           DynamicLocalMemorySize);
 }
 
-template __SYCL_EXPORT typename ext::oneapi::experimental::info::
-    kernel_queue_specific::max_num_work_group_sync::return_type
-    kernel::ext_oneapi_get_info<
-        ext::oneapi::experimental::info::kernel_queue_specific::
-            max_num_work_group_sync>(queue Queue) const;
+template <typename Param>
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
+                            size_t DynamicLocalMemorySize) const {
+  return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
+                                          DynamicLocalMemorySize);
+}
+
+template <typename Param>
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+                            size_t DynamicLocalMemorySize) const {
+  return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
+                                          DynamicLocalMemorySize);
+}
 
 #define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT)           \
   template __SYCL_EXPORT ReturnT                                               \
+  kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
+      queue, const range<1> &, size_t) const;                                  \
+  template __SYCL_EXPORT ReturnT                                               \
+  kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
+      queue, const range<2> &, size_t) const;                                  \
+  template __SYCL_EXPORT ReturnT                                               \
   kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
       queue, const range<3> &, size_t) const;
 // Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
 // kernel_queue_specific queries require the above-defined get_info interface.
 // clang-format off
-__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
 __SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
 // clang-format on
 #undef __SYCL_PARAM_TRAITS_SPEC
@@ -143,5 +158,53 @@ ur_native_handle_t kernel::getNative() const { return impl->getNative(); }
 
 ur_native_handle_t kernel::getNativeImpl() const { return impl->getNative(); }
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+// The following query was deprecated since it doesn't include a way to specify
+// the invdividual dimensions of the work group. All of the contents of this
+// #ifndef block should be removed during the next ABI breaking window.
+namespace ext::oneapi::experimental::info::kernel_queue_specific {
+struct max_num_work_group_sync {
+  using return_type = size_t;
+};
+} // namespace ext::oneapi::experimental::info::kernel_queue_specific
+template <>
+struct detail::is_kernel_queue_specific_info_desc<
+    ext::oneapi::experimental::info::kernel_queue_specific::
+        max_num_work_group_sync> : std::true_type {
+  using return_type = ext::oneapi::experimental::info::kernel_queue_specific::
+      max_num_work_group_sync::return_type;
+};
+template <>
+__SYCL2020_DEPRECATED(
+    "The 'max_num_work_group_sync' query is deprecated. See "
+    "'sycl_ext_oneapi_launch_queries' for the new 'max_num_work_groups' query.")
+__SYCL_EXPORT typename ext::oneapi::experimental::info::kernel_queue_specific::
+    max_num_work_group_sync::return_type kernel::ext_oneapi_get_info<
+        ext::oneapi::experimental::info::kernel_queue_specific::
+            max_num_work_group_sync>(queue Queue, const range<3> &WorkGroupSize,
+                                     size_t DynamicLocalMemorySize) const {
+  return ext_oneapi_get_info<ext::oneapi::experimental::info::
+                                 kernel_queue_specific::max_num_work_groups>(
+      Queue, WorkGroupSize, DynamicLocalMemorySize);
+}
+template <>
+__SYCL2020_DEPRECATED(
+    "The 'max_num_work_group_sync' query is deprecated. See "
+    "'sycl_ext_oneapi_launch_queries' for the new 'max_num_work_groups' query.")
+__SYCL_EXPORT typename ext::oneapi::experimental::info::kernel_queue_specific::
+    max_num_work_group_sync::return_type kernel::ext_oneapi_get_info<
+        ext::oneapi::experimental::info::kernel_queue_specific::
+            max_num_work_group_sync>(queue Queue) const {
+  auto Device = Queue.get_device();
+  const auto MaxWorkGroupSize =
+      get_info<info::kernel_device_specific::work_group_size>(Device);
+  const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
+  return ext_oneapi_get_info<ext::oneapi::experimental::info::
+                                 kernel_queue_specific::max_num_work_groups>(
+      Queue, WorkGroupSize,
+      /* DynamicLocalMemorySize */ 0);
+}
+#endif
+
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp
@@ -162,14 +162,10 @@ int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) {
   // It cannot be possible to launch a kernel successfully with a configuration
   // that exceeds the available resources as in the above defined workGroupSize.
   // workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0.
-  // Note: Level-Zero currently always returns a non-zero value.
-  // TODO: Remove the backend condition once the Level-Zero API issue is fixed.
-  if (dev.get_backend() != sycl::backend::ext_oneapi_level_zero) {
-    assert(maxWGs == 0 &&
-           "max_num_work_groups query failed.\n"
-           "It should return 0 possible groups when the requested resources "
-           "by the lanuch config exceed those available in the hardware.");
-  }
+  assert(maxWGs == 0 &&
+         "max_num_work_groups query failed.\n"
+         "It should return 0 possible groups when the requested resources "
+         "by the lanuch config exceed those available in the hardware.");
 
   // As we ensured that the 'max_num_work_groups' query correctly
   // returns 0 possible work-groups, test that the kernel launch will fail.
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump