intel · bader · Feb 22, 2023 · Jan 6, 2023 · Feb 3, 2023 · Feb 6, 2023
@@ -90,9 +90,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
     return true;
   }
 
-  // Increase the inlining cost threshold by a factor of 5, reflecting that
+  // Increase the inlining cost threshold by a factor of 11, reflecting that
   // calls are particularly expensive in NVPTX.
-  unsigned getInliningThresholdMultiplier() { return 5; }
+  unsigned getInliningThresholdMultiplier() { return 11; }
 
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,

@@ -241,16 +241,6 @@ struct AccHostDataT {
   void *Reserved = nullptr;
 };
 
-// To ensure loop unrolling is done when processing dimensions.
-template <size_t... Inds, class F>
-void dim_loop_impl(std::integer_sequence<size_t, Inds...>, F &&f) {
-  (f(Inds), ...);
-}
-
-template <size_t count, class F> void dim_loop(F &&f) {
-  dim_loop_impl(std::make_index_sequence<count>{}, std::forward<F>(f));
-}
-
 void __SYCL_EXPORT constructorNotification(void *BufferObj, void *AccessorObj,
                                            access::target Target,
                                            access::mode Mode,

@@ -245,6 +245,16 @@ getSPIRVMemorySemanticsMask(const access::fence_space AccessSpace,
                  LocalScopeMask);
 }
 
+// To ensure loop unrolling is done when processing dimensions.
+template <size_t... Inds, class F>
+void dim_loop_impl(std::integer_sequence<size_t, Inds...>, F &&f) {
+  (f(Inds), ...);
+}
+
+template <size_t count, class F> void dim_loop(F &&f) {
+  dim_loop_impl(std::make_index_sequence<count>{}, std::forward<F>(f));
+}
+
 } // namespace detail
 
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)

@@ -214,23 +214,24 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) {
 #endif
 }
 
-template <typename Group, typename T, class BinaryOperation>
-detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     detail::is_vector_arithmetic<T>::value &&
-                     detail::is_native_op<T, BinaryOperation>::value),
-                    T>
-reduce_over_group(Group g, T x, BinaryOperation binary_op) {
+template <typename Group, typename T, int N, class BinaryOperation>
+detail::enable_if_t<
+    (is_group_v<std::decay_t<Group>> &&
+     detail::is_vector_arithmetic<sycl::vec<T, N>>::value &&
+     detail::is_native_op<sycl::vec<T, N>, BinaryOperation>::value),
+    sycl::vec<T, N>>
+reduce_over_group(Group g, sycl::vec<T, N> x, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(x[0], x[0])),
-                   typename T::element_type>::value ||
-          (std::is_same<T, half>::value &&
+                   typename sycl::vec<T, N>::element_type>::value ||
+          (std::is_same<sycl::vec<T, N>, half>::value &&
            std::is_same<decltype(binary_op(x[0], x[0])), float>::value),
       "Result type of binary_op must match reduction accumulation type.");
-  T result;
-  for (int s = 0; s < x.size(); ++s) {
-    result[s] = reduce_over_group(g, x[s], binary_op);
-  }
+  sycl::vec<T, N> result;
+
+  detail::dim_loop<N>(
+      [&](size_t s) { result[s] = reduce_over_group(g, x[s], binary_op); });
   return result;
 }