Commit f59a17b

[SYCL] Pass handler & instead of queue across ABI for reduction utils
Queue might be `nullptr` in the graph case, but the information these utils query is device-specific. By passing the entire `handler &` and having access to graph information we'd be able to return more precise results. Another positive side effect is that we eliminate the explicit `std::shared_ptr<queue_impl>`, which is a small step forward in the ongoing refactoring effort to prefer passing `*_impl` by raw ptr/ref, with an explicit `shared_from_this` whenever lifetimes need to be extended.
1 parent 0dff0ff commit f59a17b
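
For context, a minimal sketch of the `*_impl`-by-reference convention the message describes; the names (`Impl`, `useImpl`, `keepAlive`) are illustrative stand-ins, not actual SYCL runtime classes:

#include <memory>

struct Impl : std::enable_shared_from_this<Impl> {
  int query() const { return 42; }
};

// Preferred: take the implementation by reference; no refcount traffic,
// and the signature documents that the callee does not own the object.
int useImpl(Impl &I) { return I.query(); }

// When a lifetime genuinely needs extending (e.g. stashing the object in
// an async task), the callee makes that explicit:
std::shared_ptr<Impl> keepAlive(Impl &I) { return I.shared_from_this(); }

int main() {
  auto I = std::make_shared<Impl>();
  int V = useImpl(*I);        // borrow only
  auto Owned = keepAlive(*I); // explicit ownership extension
  return (V == 42 && Owned) ? 0 : 1;
}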

File tree

4 files changed: +78 -56 lines changed


sycl/include/sycl/reduction.hpp

Lines changed: 13 additions & 23 deletions
@@ -140,17 +140,10 @@ template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
   return sycl::detail::make_tuple(Elements...);
 }
 
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr<queue_impl> &Queue,
-                                      size_t LocalMemBytesPerWorkItem);
-__SYCL_EXPORT size_t reduGetPreferredWGSize(
-    const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem);
-#else
-__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
+__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
                                       size_t LocalMemBytesPerWorkItem);
-__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
+__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
                                             size_t LocalMemBytesPerWorkItem);
-#endif
 __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                        size_t &NWorkGroups);
 
@@ -1708,8 +1701,7 @@ struct NDRangeReduction<
     reduction::strategy::group_reduce_and_multiple_kernels> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
+  static void run(handler &CGH, const std::shared_ptr<detail::queue_impl> &,
                   nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     static_assert(Reduction::has_identity,
@@ -1729,7 +1721,7 @@ struct NDRangeReduction<
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it may be safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -1847,8 +1839,7 @@ struct NDRangeReduction<
 template <> struct NDRangeReduction<reduction::strategy::basic> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
+  static void run(handler &CGH, const std::shared_ptr<detail::queue_impl> &,
                   nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     using element_type = typename Reduction::reducer_element_type;
@@ -1858,7 +1849,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it may be safer to use queries to the kernel
     // compiled for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -2623,9 +2614,9 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
 template <> struct NDRangeReduction<reduction::strategy::multi> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename... RestT>
-  static void
-  run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
-      nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
+  static void run(handler &CGH, const std::shared_ptr<detail::queue_impl> &,
+                  nd_range<Dims> NDRange, PropertiesT &Properties,
+                  RestT... Rest) {
     std::tuple<RestT...> ArgsTuple(Rest...);
     constexpr size_t NumArgs = sizeof...(RestT);
     auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
@@ -2636,7 +2627,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it is safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, LocalMemPerWorkItem);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -2731,8 +2722,7 @@ void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
                             Properties, Rest...);
 }
 
-__SYCL_EXPORT uint32_t
-reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
+__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh);
 
 template <typename KernelName, reduction::strategy Strategy, int Dims,
           typename PropertiesT, typename... RestT>
@@ -2763,13 +2753,13 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
       __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
 #else
-      reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
+      reduGetMaxNumConcurrentWorkGroups(CGH);
 #endif
 
   // TODO: currently the preferred work group size is determined for the given
   // queue/device, while it is safer to use queries to the kernel pre-compiled
   // for the device.
-  size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
+  size_t PrefWGSize = reduGetPreferredWGSize(CGH, OneElemSize);
 
   size_t NWorkItems = Range.size();
   size_t WGSize = std::min(NWorkItems, PrefWGSize);
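
The declarations above back the public `sycl::reduction` API. For context, a standard SYCL 2020 reduction that exercises this path (assuming a device with USM shared-allocation support):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  int *Sum = sycl::malloc_shared<int>(1, Q);
  *Sum = 0;
  Q.submit([&](sycl::handler &CGH) {
     // While processing this CGF, the runtime picks the kernel's work-group
     // geometry via reduGetPreferredWGSize()/reduGetMaxWGSize() -- now
     // queried through the handler itself, per this commit.
     CGH.parallel_for(sycl::range<1>(1024),
                      sycl::reduction(Sum, sycl::plus<int>()),
                      [=](sycl::item<1> It, auto &R) {
                        R += static_cast<int>(It.get_linear_id());
                      });
   }).wait();
  bool Ok = (*Sum == 523776); // 0 + 1 + ... + 1023
  sycl::free(Sum, Q);
  return Ok ? 0 : 1;
}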

sycl/source/detail/reduction.cpp

Lines changed: 59 additions & 33 deletions
@@ -49,8 +49,24 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
   return WGSize;
 }
 
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+// Inline this helper:
+#endif
+uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) {
+  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
+  // TODO: The heuristics here require additional tuning for various devices
+  // and vendors. Also, it would be better to check vendor/generation/etc.
+  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
+    NumThreads *= 8;
+  return NumThreads;
+}
 // Returns the estimated number of physical threads on the device associated
 // with the given queue.
+__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) {
+  return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device());
+}
+
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
     std::shared_ptr<sycl::detail::queue_impl> Queue) {
   // TODO: Graphs extension explicit API uses a handler with no queue attached,
@@ -63,25 +79,14 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
   if (Queue == nullptr) {
     return 8;
   }
-  device Dev = Queue->get_device();
-  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
-  // TODO: The heuristics here require additional tuning for various devices
-  // and vendors. Also, it would be better to check vendor/generation/etc.
-  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
-    NumThreads *= 8;
-  return NumThreads;
+  return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl());
 }
+#endif
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t
-reduGetMaxWGSize(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
-                 size_t LocalMemBytesPerWorkItem) {
-#else
-__SYCL_EXPORT size_t
-reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
-                 size_t LocalMemBytesPerWorkItem) {
+// Inline this helper:
 #endif
-  device Dev = Queue->get_device();
+size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) {
   size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();
 
   size_t WGSizePerMem = MaxWGSize * 2;
@@ -118,26 +123,24 @@ reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
 
   return WGSize;
 }
+__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
+                                      size_t LocalMemBytesPerWorkItem) {
+  return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(),
+                          LocalMemBytesPerWorkItem);
+}
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+__SYCL_EXPORT
+size_t reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
+                        size_t LocalMemBytesPerWorkItem) {
+  return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem);
+}
+#endif
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t reduGetPreferredWGSize(
-    const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem) {
-#else
-__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
-                                            size_t LocalMemBytesPerWorkItem) {
+// Inline this helper:
 #endif
-  // TODO: Graphs extension explicit API uses a handler with a null queue to
-  // process CGFs, in future we should have access to the device so we can
-  // correctly calculate this.
-  //
-  // The 32 value was chosen as the hardcoded value as it is the returned
-  // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
-  // Intel HD Graphics devices used as a L0 backend during development.
-  if (Queue == nullptr) {
-    return 32;
-  }
-  device Dev = Queue->get_device();
-
+size_t reduGetPreferredWGSize(device_impl &Dev,
+                              size_t LocalMemBytesPerWorkItem) {
   // The maximum WGSize returned by CPU devices is very large and does not
   // help the reduction implementation: since all work associated with a
   // work-group is typically assigned to one CPU thread, selecting a large
@@ -174,8 +177,31 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
   }
 
   // Use the maximum work-group size otherwise.
-  return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
+  return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem);
+}
+__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
+                                            size_t LocalMemBytesPerWorkItem) {
+  return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(),
+                                LocalMemBytesPerWorkItem);
 }
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
+                                            size_t LocalMemBytesPerWorkItem) {
+  // TODO: Graphs extension explicit API uses a handler with a null queue to
+  // process CGFs, in future we should have access to the device so we can
+  // correctly calculate this.
+  //
+  // The 32 value was chosen as the hardcoded value as it is the returned
+  // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
+  // Intel HD Graphics devices used as a L0 backend during development.
+  if (Queue == nullptr) {
+    return 32;
+  }
+  device_impl &Dev = Queue->getDeviceImpl();
+
+  return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem);
+}
+#endif
 
 __SYCL_EXPORT void
 addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
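
The layout above follows a common ABI-transition pattern: the `device_impl`-based helper holds the logic, the new exported overload is a thin wrapper over it, and the old exported symbol is kept only so already-linked binaries keep resolving until the next ABI-breaking window. A generic sketch of the same shape (`LibDevice`, `entryPointImpl`, `LIB_PREVIEW_BREAKING_CHANGES` are illustrative stand-ins, not real SYCL identifiers):

#include <memory>

struct LibDevice {
  int maxUnits() const { return 8; }
};

// Shared implementation; becomes the only copy once the old symbol goes.
static int entryPointImpl(LibDevice &Dev) { return Dev.maxUnits(); }

// New ABI surface: thin wrapper over the helper.
int newEntryPoint(LibDevice &Dev) { return entryPointImpl(Dev); }

#ifndef LIB_PREVIEW_BREAKING_CHANGES
// Old ABI surface, preserved for existing binaries; compiled out when the
// preview-breaking-changes build finally drops it.
int oldEntryPoint(std::shared_ptr<LibDevice> Dev) {
  return entryPointImpl(*Dev);
}
#endif

int main() {
  LibDevice Dev;
  return newEntryPoint(Dev) == 8 ? 0 : 1;
}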

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 3 additions & 0 deletions
@@ -3290,6 +3290,7 @@ _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6a
 _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE
 _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviibmbRKNS0_13property_listE
 _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE
+_ZN4sycl3_V16detail16reduGetMaxWGSizeERNS0_7handlerEm
 _ZN4sycl3_V16detail16reduGetMaxWGSizeESt10shared_ptrINS1_10queue_implEEm
 _ZN4sycl3_V16detail17HostProfilingInfo3endEv
 _ZN4sycl3_V16detail17HostProfilingInfo5startEv
@@ -3324,6 +3325,7 @@ _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6devi
 _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE
 _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE
 _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE
+_ZN4sycl3_V16detail22reduGetPreferredWGSizeERNS0_7handlerEm
 _ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm
 _ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE
 _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE
@@ -3343,6 +3345,7 @@ _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHost6getPtrEv
 _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC1ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE
 _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC2ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE
 _ZN4sycl3_V16detail33enable_ext_oneapi_default_contextEb
+_ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsERNS0_7handlerE
 _ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsESt10shared_ptrINS1_10queue_implEE
 _ZN4sycl3_V16detail34addHostSampledImageAccessorAndWaitEPNS1_28SampledImageAccessorImplHostE
 _ZN4sycl3_V16detail35sampledImageConstructorNotificationEPvS2_RKSt8optionalINS0_12image_targetEEPKvjRKNS1_13code_locationE
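
The new Itanium-mangled entries can be sanity-checked with the toolchain's demangler; for example, a small host-only program using `abi::__cxa_demangle` from `<cxxabi.h>` (the expected output in the comment is my reading of the mangling, not part of the dump):

#include <cstdio>
#include <cstdlib>
#include <cxxabi.h>

int main() {
  const char *Sym = "_ZN4sycl3_V16detail16reduGetMaxWGSizeERNS0_7handlerEm";
  int Status = 0;
  // Demangle the first added Linux symbol; Status == 0 means success.
  char *Name = abi::__cxa_demangle(Sym, nullptr, nullptr, &Status);
  if (Status == 0)
    std::puts(Name); // expected:
                     // sycl::_V1::detail::reduGetMaxWGSize(sycl::_V1::handler&, unsigned long)
  std::free(Name);
  return Status;
}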

sycl/test/abi/sycl_symbols_windows.dump

Lines changed: 3 additions & 0 deletions
@@ -4342,9 +4342,12 @@
 ?processArg@handler@_V1@sycl@@AEAAXPEAXAEBW4kernel_param_kind_t@detail@23@H_KAEA_K_N4@Z
 ?query@tls_code_loc_t@detail@_V1@sycl@@QEAAAEBUcode_location@234@XZ
 ?reduComputeWGSize@detail@_V1@sycl@@YA_K_K0AEA_K@Z
+?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIAEAVhandler@23@@Z
 ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z
+?reduGetMaxWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z
 ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z
 ?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z
+?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z
 ?registerDynamicParameter@handler@_V1@sycl@@AEAAXAEAVdynamic_parameter_base@detail@experimental@oneapi@ext@23@H@Z
 ?registerDynamicParameter@handler@_V1@sycl@@AEAAXPEAVdynamic_parameter_impl@detail@experimental@oneapi@ext@23@H@Z
 ?release_external_memory@experimental@oneapi@ext@_V1@sycl@@YAXUexternal_mem@12345@AEBVdevice@45@AEBVcontext@45@@Z
