@@ -649,6 +649,13 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     }
   }
 
+  template <class _T = T, int D = buffer_dim>
+  auto &getTempBuffer(size_t Size, handler &CGH) {
+    auto Buffer = std::make_shared<buffer<_T, D>>(range<1>(Size));
+    CGH.addReduction(Buffer);
+    return *Buffer;
+  }
+
   /// Returns an accessor accessing the memory that will hold the reduction
   /// partial sums.
   /// If \p Size is equal to one, then the reduction result is the final and
@@ -708,15 +715,28 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     return {*CounterBuf, CGH};
   }
 
-  RedOutVar &getUserRedVar() { return MRedOut; }
-
-  static inline result_type *getOutPointer(const rw_accessor_type &OutAcc) {
-    return OutAcc.get_pointer().get();
+  // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
+  // extra kernel than copy it from the host.
+  template <typename Name> auto getGroupsCounterAccDiscrete(handler &CGH) {
+    auto &Buf = getTempBuffer<int, 1>(1, CGH);
+    std::shared_ptr<detail::queue_impl> QueueCopy = CGH.MQueue;
+    auto Event = CGH.withAuxHandler(QueueCopy, [&](handler &InitHandler) {
+      auto Acc = accessor{Buf, InitHandler, sycl::write_only, sycl::no_init};
+      InitHandler.single_task<Name>([=]() { Acc[0] = 0; });
+    });
+    CGH.depends_on(Event);
+    return accessor{Buf, CGH};
   }
 
+  RedOutVar &getUserRedVar() { return MRedOut; }
+
   static inline result_type *getOutPointer(result_type *OutPtr) {
     return OutPtr;
   }
+  template <class AccessorType>
+  static inline result_type *getOutPointer(const AccessorType &OutAcc) {
+    return OutAcc.get_pointer().get();
+  }
 
 private:
   template <typename BufferT>
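
Reviewer note: the new getGroupsCounterAccDiscrete relies on internal handler plumbing (withAuxHandler, CGH.MQueue, addReduction), but the underlying trick is simply to zero a device-side counter with a tiny auxiliary kernel instead of copying the initial value from the host. A minimal sketch of that idea using only public SYCL 2020 API; the kernel name and helper below are illustrative and not part of this patch:

#include <sycl/sycl.hpp>

class InitCounterKernel; // illustrative kernel name, not from the patch

// Create a one-element counter buffer and zero it on the device, so no
// host-to-device copy of the initial value is needed.
inline sycl::buffer<int, 1> makeZeroedCounter(sycl::queue &Q) {
  sycl::buffer<int, 1> Counter{sycl::range<1>{1}};
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor Acc{Counter, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task<InitCounterKernel>([=] { Acc[0] = 0; });
  });
  // Later command groups that use Counter pick up an implicit dependency on
  // this init task through the buffer's requirements, mirroring the explicit
  // depends_on(Event) in the patch.
  return Counter;
}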
@@ -892,7 +912,7 @@ template <class KernelName> struct RangeFastAtomics;
 } // namespace main_krn
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
                                    const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    Reduction &Redu) {
@@ -927,29 +947,43 @@ void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
       Reducer.template atomic_combine(Reduction::getOutPointer(Out));
     }
   });
+  return Reduction::is_usm || Redu.initializeToIdentity();
 }
 
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct RangeFastReduce;
 } // namespace main_krn
+namespace init_krn {
+template <class KernelName> struct GroupCounter;
+}
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
                                   const range<Dims> &Range,
                                   const nd_range<1> &NDRange, Reduction &Redu) {
   constexpr size_t NElements = Reduction::num_elements;
   size_t WGSize = NDRange.get_local_range().size();
   size_t NWorkGroups = NDRange.get_group_range().size();
 
+  auto &Out = Redu.getUserRedVar();
+  if constexpr (Reduction::is_acc)
+    associateWithHandler(CGH, &Out, access::target::device);
+
+  auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
+  accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
+
   bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
-  auto PartialSums =
-      Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
-  auto Out = (NWorkGroups == 1)
-                 ? PartialSums
-                 : Redu.getWriteAccForPartialReds(NElements, CGH);
+  using InitName =
+      __sycl_reduction_kernel<reduction::init_krn::GroupCounter, KernelName>;
+
+  // Integrated and discrete GPUs have different fast paths.
   auto NWorkGroupsFinished =
-      Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
+      sycl::detail::getDeviceFromHandler(CGH)
+              .get_info<info::device::host_unified_memory>()
+          ? Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH)
+          : Redu.template getGroupsCounterAccDiscrete<InitName>(CGH);
+
   auto DoReducePartialSumsInLastWG =
       Reduction::template getReadWriteLocalAcc<int>(1, CGH);
 
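
The selection between the two counter-initialization paths above hinges on a single device query. A hedged sketch of that query in isolation, assuming Q is an arbitrary sycl::queue (the integrated-vs-discrete heuristic itself is the patch's, not mine):

// Integrated GPUs report host-unified memory, so writing the counter from the
// host is cheap and the existing host-initialized path is kept. Discrete GPUs
// take the new path that zeroes the counter with an extra kernel
// (getGroupsCounterAccDiscrete above).
sycl::device Dev = Q.get_device();
bool UseHostInitializedCounter =
    Dev.get_info<sycl::info::device::host_unified_memory>();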
@@ -967,50 +1001,57 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
     // reduce_over_group is only defined for each T, not for span<T, ...>
     size_t LID = NDId.get_local_id(0);
     for (int E = 0; E < NElements; ++E) {
-      Reducer.getElement(E) =
-          reduce_over_group(Group, Reducer.getElement(E), BOp);
-
+      auto &RedElem = Reducer.getElement(E);
+      RedElem = reduce_over_group(Group, RedElem, BOp);
       if (LID == 0) {
-        if (NWorkGroups == 1 && IsUpdateOfUserVar)
-          Reducer.getElement(E) =
-              BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-
-        // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
-        Reduction::getOutPointer(
-            PartialSums)[NDId.get_group_linear_id() * NElements + E] =
-            Reducer.getElement(E);
+        if (NWorkGroups == 1) {
+          auto &OutElem = Reduction::getOutPointer(Out)[E];
+          // Can skip the partial sums and write the final result immediately.
+          if (IsUpdateOfUserVar)
+            RedElem = BOp(RedElem, OutElem);
+          OutElem = RedElem;
+        } else {
+          PartialSums[NDId.get_group_linear_id() * NElements + E] =
+              Reducer.getElement(E);
+        }
       }
     }
 
+    if (NWorkGroups == 1)
+      // We're done.
+      return;
+
    // Signal this work-group has finished after all values are reduced
    if (LID == 0) {
      auto NFinished =
          sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
                           access::address_space::global_space>(
              NWorkGroupsFinished[0]);
-      DoReducePartialSumsInLastWG[0] =
-          ++NFinished == NWorkGroups && NWorkGroups > 1;
+      DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
    }
 
    sycl::detail::workGroupBarrier();
    if (DoReducePartialSumsInLastWG[0]) {
      // Reduce each result separately
-      // TODO: Opportunity to parallelize across elements
+      // TODO: Opportunity to parallelize across elements.
      for (int E = 0; E < NElements; ++E) {
+        auto &OutElem = Reduction::getOutPointer(Out)[E];
        auto LocalSum = Reducer.getIdentity();
        for (size_t I = LID; I < NWorkGroups; I += WGSize)
          LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
-        Reducer.getElement(E) = reduce_over_group(Group, LocalSum, BOp);
+        auto Result = reduce_over_group(Group, LocalSum, BOp);
 
        if (LID == 0) {
          if (IsUpdateOfUserVar)
-            Reducer.getElement(E) =
-                BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-          Reduction::getOutPointer(Out)[E] = Reducer.getElement(E);
+            Result = BOp(Result, OutElem);
+          OutElem = Result;
        }
      }
    }
  });
+
+  // We've updated the user's variable; no extra work is needed.
+  return false;
 }
 
 namespace reduction {
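
The kernel above combines three ingredients: a per-group reduce_over_group, a global atomic counter of finished groups, and a final pass done by whichever group increments the counter last. Below is a self-contained sketch of the same pattern for a plain int sum, written against public SYCL 2020 API only; the kernel names, the acq_rel ordering choice, and all helper names are mine, not the library's, and the real implementation handles memory ordering and dependencies differently.

#include <sycl/sycl.hpp>

class InitFinishedCount;  // illustrative kernel names, not from the patch
class PartialSumsKernel;

int sumOnDevice(sycl::queue &Q, sycl::buffer<int, 1> &In, size_t WGSize) {
  const size_t N = In.size();
  const size_t NWorkGroups = (N + WGSize - 1) / WGSize;
  sycl::buffer<int, 1> Partial{sycl::range<1>{NWorkGroups}};
  sycl::buffer<int, 1> Finished{sycl::range<1>{1}};
  sycl::buffer<int, 1> Result{sycl::range<1>{1}};

  // Zero the "finished groups" counter on the device (same idea as
  // getGroupsCounterAccDiscrete above).
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor F{Finished, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task<InitFinishedCount>([=] { F[0] = 0; });
  });

  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor InAcc{In, CGH, sycl::read_only};
    sycl::accessor PartialAcc{Partial, CGH, sycl::read_write, sycl::no_init};
    sycl::accessor FinishedAcc{Finished, CGH, sycl::read_write};
    sycl::accessor OutAcc{Result, CGH, sycl::write_only, sycl::no_init};
    sycl::local_accessor<int, 1> IsLastGroup{sycl::range<1>{1}, CGH};
    CGH.parallel_for<PartialSumsKernel>(
        sycl::nd_range<1>{sycl::range<1>{NWorkGroups * WGSize},
                          sycl::range<1>{WGSize}},
        [=](sycl::nd_item<1> It) {
          size_t GID = It.get_global_id(0);
          size_t LID = It.get_local_id(0);
          // Each group reduces its chunk to one partial sum.
          int V = GID < N ? InAcc[GID] : 0;
          int GroupSum =
              sycl::reduce_over_group(It.get_group(), V, sycl::plus<int>{});
          if (LID == 0) {
            PartialAcc[It.get_group_linear_id()] = GroupSum;
            // Count finished groups; acq_rel makes earlier groups' partial
            // sums visible to the group that observes the final count.
            sycl::atomic_ref<int, sycl::memory_order::acq_rel,
                             sycl::memory_scope::device,
                             sycl::access::address_space::global_space>
                NFinished(FinishedAcc[0]);
            IsLastGroup[0] =
                (NFinished.fetch_add(1) + 1 == static_cast<int>(NWorkGroups));
          }
          sycl::group_barrier(It.get_group());
          if (IsLastGroup[0]) {
            // Last group: strided sweep over all partial sums, then one more
            // group reduction produces the final value.
            int Local = 0;
            for (size_t I = LID; I < NWorkGroups; I += WGSize)
              Local += PartialAcc[I];
            int Total = sycl::reduce_over_group(It.get_group(), Local,
                                                sycl::plus<int>{});
            if (LID == 0)
              OutAcc[0] = Total;
          }
        });
  });
  return sycl::host_accessor{Result}[0];
}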
@@ -1019,7 +1060,7 @@ template <class KernelName> struct RangeBasic;
 } // namespace main_krn
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
                              const range<Dims> &Range,
                              const nd_range<1> &NDRange, Reduction &Redu) {
   constexpr size_t NElements = Reduction::num_elements;
@@ -1125,10 +1166,13 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
       }
     }
   });
+  return Reduction::is_usm || Reduction::is_dw_acc;
 }
 
+/// Returns "true" if the result has to be saved to the user's variable by
+/// reduSaveFinalResultToUserMem.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
                         const range<Dims> &Range, size_t MaxWGSize,
                         uint32_t NumConcurrentWorkGroups, Reduction &Redu) {
   size_t NWorkItems = Range.size();
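
With the switch from void to bool, the reduCGFuncForRange* helpers now report whether a separate copy-out step is still required. A hypothetical caller-side sketch of how that result might be consumed; submitSaveToUserMem stands in for the library's reduSaveFinalResultToUserMem helper, whose exact signature is not shown in this diff:

// Hypothetical sketch, not taken from the patch.
bool NeedsCopyOut = reduCGFuncForRange<KernelName>(
    CGH, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
if (NeedsCopyOut)
  submitSaveToUserMem(Queue, Redu); // stand-in for reduSaveFinalResultToUserMem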
@@ -1141,16 +1185,15 @@ void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
   size_t NDRItems = NWorkGroups * WGSize;
   nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
 
-  if constexpr (Reduction::has_fast_atomics) {
-    reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                              Redu);
-
-  } else if constexpr (Reduction::has_fast_reduce) {
-    reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                             Redu);
-  } else {
-    reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
-  }
+  if constexpr (Reduction::has_fast_atomics)
+    return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
+                                                     NDRange, Redu);
+  else if constexpr (Reduction::has_fast_reduce)
+    return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
+                                                    NDRange, Redu);
+  else
+    return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
+                                               Redu);
 }
 
 namespace reduction {