@@ -53,14 +53,20 @@ using IsReduOptForFastAtomicFetch =
sycl::detail::IsBitAND<T, BinaryOperation>::value)>;
#endif

- // This type trait is used to detect if the group algorithm reduce() used with
- // operands of the type T and the operation Plus is available
- // for using in reduction. Note that this type trait is a subset of
+ // This type trait is used to detect if the atomic operation BinaryOperation
+ // used with operands of the type T is available for using in reduction, in
+ // addition to the cases covered by "IsReduOptForFastAtomicFetch", if the device
+ // has the atomic64 aspect. This type trait should only be used if the device
+ // has the atomic64 aspect. Note that this type trait is currently a subset of
// IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
- // using the reduce() algorithm to produce stable results across same type
- // devices.
+ // using the reduce_over_group() algorithm to produce stable results across same
+ // type devices.
+ // TODO 32 bit floating point atomics are eventually expected to be supported by
+ // the has_fast_atomics specialization. Once the reducer class is updated to
+ // replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
+ // case should be removed here and replaced in IsReduOptForFastAtomicFetch.
template <typename T, class BinaryOperation>
- using IsReduOptForFastFloatAtomicAdd =
+ using IsReduOptForAtomic64Add =
#ifdef SYCL_REDUCTION_DETERMINISTIC
bool_constant<false>;
#else
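
The hunk cuts off before the #else branch, but the new comment pins down what the trait must express: a Plus operation over a 32- or 64-bit floating point type. For intuition, here is a self-contained stand-in sketch using std::plus and std::is_floating_point in place of the SYCL detail helpers (which this hunk does not show); it is an illustration of the intent, not the library code.

#include <functional>
#include <type_traits>

// Stand-in sketch: true for a 4- or 8-byte floating point T combined with a
// plus operation, mirroring what the comment above describes.
template <typename T, class BinaryOperation>
using IsReduOptForAtomic64AddSketch =
    std::bool_constant<std::is_same<BinaryOperation, std::plus<T>>::value &&
                       std::is_floating_point<T>::value &&
                       (sizeof(T) == 4 || sizeof(T) == 8)>;

static_assert(IsReduOptForAtomic64AddSketch<double, std::plus<double>>::value, "");
static_assert(!IsReduOptForAtomic64AddSketch<int, std::plus<int>>::value, "");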
@@ -307,7 +313,7 @@ class reducer<T, BinaryOperation,
/// Atomic ADD operation: for floating point using atomic_ref
template <typename _T = T, class _BinaryOperation = BinaryOperation>
enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
- IsReduOptForFastFloatAtomicAdd<T, _BinaryOperation>::value>
+ IsReduOptForAtomic64Add<T, _BinaryOperation>::value>
atomic_combine(_T *ReduVarPtr) const {

atomic_ref<T, sycl::ONEAPI::memory_order::relaxed,
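
The /// comment points at the mechanism: the combine is a relaxed atomic_ref fetch-add on the global reduction variable. A minimal standalone sketch of that device-side operation follows; the free function, its names, and the memory scope / address space choices are illustrative assumptions, not the reducer's actual member.

#include <CL/sycl.hpp>

// Sketch: the kind of relaxed atomic add that atomic_combine performs on a
// 64-bit floating point reduction variable (intended for device code).
void atomicAddSketch(double *ReduVarPtr, double PartialSum) {
  sycl::ONEAPI::atomic_ref<double, sycl::ONEAPI::memory_order::relaxed,
                           sycl::ONEAPI::memory_scope::device,
                           sycl::access::address_space::global_space>
      Ref(*ReduVarPtr);
  Ref.fetch_add(PartialSum);
}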
@@ -358,8 +364,8 @@ class reduction_impl : private reduction_impl_base {
using local_accessor_type =
accessor<T, buffer_dim, access::mode::read_write, access::target::local>;

- static constexpr bool has_atomic_add_float =
- IsReduOptForFastFloatAtomicAdd<T, BinaryOperation>::value;
+ static constexpr bool has_atomic_add_float64 =
+ IsReduOptForAtomic64Add<T, BinaryOperation>::value;
static constexpr bool has_fast_atomics =
IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
static constexpr bool has_fast_reduce =
@@ -667,8 +673,8 @@ class reduction_impl : private reduction_impl_base {
/// accessor. Otherwise, create 1-element global buffer initialized with
/// identity value and return an accessor to that buffer.

- template <bool HasFastAtomics = has_fast_atomics>
- std::enable_if_t<HasFastAtomics || has_atomic_add_float, rw_accessor_type>
+ template <bool HasFastAtomics = (has_fast_atomics || has_atomic_add_float64)>
+ std::enable_if_t<HasFastAtomics, rw_accessor_type>
getReadWriteAccessorToInitializedMem(handler &CGH) {
if (!is_usm && !initializeToIdentity())
return *MRWAcc;
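
The revised signature folds both flags into the default template argument, so the enable_if condition is spelled out once and callers never pass the parameter. A minimal standalone illustration of this C++ pattern (the struct and member names here are hypothetical, not the library's):

#include <type_traits>

struct ReductionLike {
  static constexpr bool has_fast_atomics = false;
  static constexpr bool has_atomic_add_float64 = true;

  // The default template argument evaluates the combined condition; the
  // member only exists when either flag is set, and callers just call get().
  template <bool UseAtomics = (has_fast_atomics || has_atomic_add_float64)>
  std::enable_if_t<UseAtomics, int> get() const {
    return 0;
  }
};

int main() { return ReductionLike{}.get(); }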
@@ -1499,15 +1505,19 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
}

// Specialization for devices with the atomic64 aspect, which guarantees 64 (and
- // 32) bit floating point support for atomic add.
+ // temporarily 32) bit floating point support for atomic add.
+ // TODO 32 bit floating point atomics are eventually expected to be supported by
+ // the has_fast_atomics specialization. Corresponding changes to
+ // IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
+ // be made.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
- std::enable_if_t<Reduction::has_atomic_add_float>
+ std::enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &,
typename Reduction::rw_accessor_type Out) {
using Name = typename get_reduction_main_kernel_name_t<
KernelName, KernelType, Reduction::is_usm,
- Reduction::has_atomic_add_float,
+ Reduction::has_atomic_add_float64,
typename Reduction::rw_accessor_type>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
// Call user's function. Reducer.MValue gets initialized there.
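
Because this overload is only valid on devices exposing the atomic64 aspect, selecting it implies a device capability query somewhere on the host side. A hedged sketch of that kind of check, using the standard aspect query; the helper function is hypothetical and not the library's actual dispatch code:

#include <CL/sycl.hpp>

// Sketch: take the atomic64 reduction path only when the target device
// reports the aspect; otherwise a non-atomic fallback is needed.
bool canUseAtomic64Path(const sycl::queue &Q) {
  return Q.get_device().has(sycl::aspect::atomic64);
}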
@@ -1523,9 +1533,13 @@ reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc,
}

// Specialization for devices with the atomic64 aspect, which guarantees 64 (and
- // 32) bit floating point support for atomic add.
+ // temporarily 32) bit floating point support for atomic add.
+ // TODO 32 bit floating point atomics are eventually expected to be supported by
+ // the has_fast_atomics specialization. Corresponding changes to
+ // IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
+ // be made.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
- enable_if_t<Reduction::has_atomic_add_float>
+ enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu) {
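
For reference, this is the kind of user-level reduction the atomic64 path serves: a sum over double, which the fast-atomics route alone does not cover. A sketch against the ONEAPI reduction extension interface of this period; the kernel name, sizes, and values are illustrative.

#include <CL/sycl.hpp>
using namespace sycl;

int main() {
  queue Q;
  // A 64-bit floating point sum reduction; on a device with aspect::atomic64
  // the implementation can combine partial results with atomic adds.
  double *Sum = malloc_shared<double>(1, Q);
  *Sum = 0.0;
  Q.submit([&](handler &CGH) {
     auto Redu = ONEAPI::reduction(Sum, ONEAPI::plus<double>());
     CGH.parallel_for<class Atomic64SumSketch>(
         nd_range<1>{range<1>{1024}, range<1>{64}}, Redu,
         [=](nd_item<1> It, auto &SumArg) { SumArg += 1.0; });
   }).wait();
  sycl::free(Sum, Q);
  return 0;
}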