[ESIMD] Fix slm_atomic_update() implementation for double type (#12337)

v-klochkov · web-flow · commit c7549f9166d5 · 2024-01-10T10:06:03.000-06:00
For double type the GenX intrinsic expects double vectors without
bit-casting them to integer types as for other types.

This fix enables FMAX/FMIN/FCMPXCHG slm_atomic_update() for double type.
It requires pretty new GPU driver.

Signed-off-by: Klochkov, Vyacheslav N &lt;vyacheslav.n.klochkov@intel.com&gt;
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -4089,15 +4089,22 @@ slm_atomic_update_impl(simd<uint32_t, N> offsets, simd<T, N> src0,
   constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
   constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
-  using MsgT = typename lsc_expand_type<T>::type;
   constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
-  simd<MsgT, N> Tmp =
-      __esimd_lsc_xatomic_slm_1<MsgT, IOp, cache_hint::none, cache_hint::none,
-                                AddressScale, ImmOffset, EDS, VS, Transposed,
-                                N>(pred.data(), offsets.data(),
-                                   Msg_data.data());
-  return lsc_format_ret<T>(Tmp);
+  if constexpr (std::is_same_v<T, double>) {
+    return __esimd_lsc_xatomic_slm_1<T, IOp, cache_hint::none, cache_hint::none,
+                                     AddressScale, ImmOffset, EDS, VS,
+                                     Transposed, N>(pred.data(), offsets.data(),
+                                                    src0.data());
+  } else {
+    using MsgT = typename lsc_expand_type<T>::type;
+    simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
+    simd<MsgT, N> Tmp =
+        __esimd_lsc_xatomic_slm_1<MsgT, IOp, cache_hint::none, cache_hint::none,
+                                  AddressScale, ImmOffset, EDS, VS, Transposed,
+                                  N>(pred.data(), offsets.data(),
+                                     Msg_data.data());
+    return lsc_format_ret<T>(Tmp);
+  }
 }
 
 /// SLM atomic.
@@ -4126,16 +4133,23 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
   constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
   constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
-  using MsgT = typename lsc_expand_type<T>::type;
   constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
-  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
-  simd<MsgT, N> Tmp =
-      __esimd_lsc_xatomic_slm_2<MsgT, IOp, cache_hint::none, cache_hint::none,
-                                AddressScale, ImmOffset, EDS, VS, Transposed,
-                                N>(pred.data(), offsets.data(),
-                                   Msg_data0.data(), Msg_data1.data());
-  return lsc_format_ret<T>(Tmp);
+  if constexpr (std::is_same_v<T, double>) {
+    return __esimd_lsc_xatomic_slm_2<T, IOp, cache_hint::none, cache_hint::none,
+                                     AddressScale, ImmOffset, EDS, VS,
+                                     Transposed, N>(pred.data(), offsets.data(),
+                                                    src0.data(), src1.data());
+  } else {
+    using MsgT = typename lsc_expand_type<T>::type;
+    simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
+    simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
+    simd<MsgT, N> Tmp =
+        __esimd_lsc_xatomic_slm_2<MsgT, IOp, cache_hint::none, cache_hint::none,
+                                  AddressScale, ImmOffset, EDS, VS, Transposed,
+                                  N>(pred.data(), offsets.data(),
+                                     Msg_data0.data(), Msg_data1.data());
+    return lsc_format_ret<T>(Tmp);
+  }
 }
 
 } // namespace detail
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
@@ -612,8 +612,7 @@ bool test_fp_types(queue q) {
 
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
-    // TODO: fmin/max for double does not pass validation likely due to
-    // a driver bug. fcmpwr is hanging.
+    // TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver.
     if constexpr (!std::is_same_v<Op<double, N>, ImplLSCFmax<double, N>> &&
                   !std::is_same_v<Op<double, N>, ImplLSCFmin<double, N>> &&
                   !std::is_same_v<Op<double, N>, ImplLSCFcmpwr<double, N>>) {