[ESIMD] Fix perf regression caused by assumed align in block_load(usm) (#11850)

v-klochkov · web-flow · commit c6362a010912 · 2023-11-09T21:56:21.000-06:00
The element-size address alignment is valid from correctness point of
view, but using 1-byte and 2-byte alignment implicitly causes
performance regression for block_load(const int8_t *, ...) and
block_load(const int16_t *, ...) because GPU BE have to generate slower
GATHER instead of more efficient BLOCK-LOAD. Without this fix block-load
causes up to 44% performance slow-down on some apps that used
block_load() with alignment assumptions used before block_load(usm, ...,
compile_time_props) was implemented.

The reasoning for the expected/assumed alignment from element-size to
4-bytes for byte- and word-vectors is such:
   The idea of block_load() call (opposing to gather() call) is to have
   efficient block-load, and thus the assumed alignment is such that
   allows to generate block-load. This is a bit more tricky for user
   but that is how block_load/store API always worked before: block-load
   had restrictions that needed to be honored.
   To be on safer side, user can always pass the guaranteed alignment.

---------

Signed-off-by: Klochkov, Vyacheslav N &lt;vyacheslav.n.klochkov@intel.com&gt;
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -695,9 +695,11 @@ block_store(Tx *addr, simd<Tx, N> vals, Flags) {
 /// the cache_hint::none value is assumed by default.
 ///
 /// Alignment: If \p props does not specify the 'alignment' property, then
-/// the default assumed alignment is the minimally required element-size
-/// alignment. Note that additional/temporary restrictions may apply
-/// (see Restrictions below).
+/// the default assumed alignment is 4-bytes for 4-byte or smaller elements
+/// and 8-bytes for 8-byte elements. The address may be element-size aligned
+/// even for byte- and word-elements, but in such case the smaller alignment
+/// property must explicitly passed to this function. Extra restrictions
+/// may be in place - see Restrictions/R1 below.
 ///
 /// Restrictions - cache hint imposed - temporary:
 /// If L1 or L2 cache hint is passed, then:
@@ -727,21 +729,16 @@ block_load(const T *ptr, PropertyListT props = {}) {
                 "L3 cache hint is reserved. The old/experimental L3 LSC cache "
                 "hint is cache_level::L2 now.");
 
+  constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
+  constexpr size_t Alignment =
+      detail::getPropertyValue<PropertyListT, alignment_key>(DefaultAlignment);
   if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none) {
     detail::check_cache_hint<detail::cache_action::load, L1Hint, L2Hint>();
-    constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
-    constexpr size_t Alignment =
-        detail::getPropertyValue<PropertyListT, alignment_key>(
-            DefaultAlignment);
 
     simd_mask<1> Mask = 1;
     return detail::block_load_impl<T, N, L1Hint, L2Hint>(
         ptr, Mask, overaligned_tag<Alignment>{});
   } else {
-    // If the alignment property is not passed, then assume the pointer
-    // is element-aligned.
-    constexpr size_t Alignment =
-        detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
     return block_load<T, N>(ptr, overaligned_tag<Alignment>{});
   }
 }
@@ -763,9 +760,11 @@ block_load(const T *ptr, PropertyListT props = {}) {
 /// the cache_hint::none value is assumed by default.
 ///
 /// Alignment: If \p props does not specify the 'alignment' property, then
-/// the default assumed alignment is the minimally required element-size
-/// alignment. Note that additional/temporary restrictions may apply
-/// (see Restrictions below).
+/// the default assumed alignment is 4-bytes for 4-byte or smaller elements
+/// and 8-bytes for 8-byte elements. The address may be element-size aligned
+/// even for byte- and word-elements, but in such case the smaller alignment
+/// property must explicitly passed to this function. Extra restrictions
+/// may be in place - see Restrictions/R1 below.
 ///
 /// Restrictions - cache hint imposed - temporary:
 /// If L1 or L2 cache hint is passed, then:
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
@@ -64,6 +64,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void foo(AccType &acc,
   simd<float, N> pass_thru = 1;
   simd<int, N> pass_thrui = 1;
   const int *ptri = reinterpret_cast<const int *>(ptrf);
+  const int8_t *ptrb = reinterpret_cast<const int8_t *>(ptrf);
 
   // CHECK: call <4 x float> @llvm.genx.lsc.load.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
   auto d1 = block_load<float, N>(ptrf, props_a);
@@ -187,4 +188,12 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void foo(AccType &acc,
   simd<double, 4> pass_thrud4 = 2.0;
   auto lacc_bl6 = block_load<double, 4>(local_acc, byte_offset32, mask,
                                         pass_thrud4, props_a);
+
+  // Check the default/assumed alignment when the alignment property is
+  // not specified explicitly.
+  // TODO: Extend this kind of tests:
+  //   {usm, acc, local_acc, slm} x {byte, word, dword, qword}.
+
+  // CHECK: load <16 x i8>, ptr addrspace(4) {{[^)]+}}, align 4
+  auto align_check1 = block_load<int8_t, 16>(ptrb);
 }