[SYCL][ESIMD] Add 8/16-bit type support to lsc_block_load/store #6757

Merged · 14 commits · Sep 22, 2022
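
This patch extends the transposed (single-channel) lsc_block_load and lsc_block_store paths, which previously accepted only the u32 and u64 data sizes, to 8- and 16-bit element types. The new code packs NElts narrow elements into NElts/2 (u16) or NElts/4 (u8) dwords, performs the LSC operation at u32 granularity, and bit-casts the result back to the requested element type.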
115 changes: 87 additions & 28 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -495,22 +495,35 @@ template <typename T, uint8_t NElts = 1,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API __ESIMD_NS::simd<T, NElts>
lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed load is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
addrs.data());
if constexpr (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64) {
detail::check_lsc_vector_size<NElts>();
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
_DS, _VS, _Transposed, N>(pred.data(),
addrs.data());
} else if constexpr (_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8) {
constexpr int NElemsInDword = _DS == lsc_data_size::u16 ? 2 : 4;
static_assert(NElts % NElemsInDword == 0,
"Number of elements is not supported by Transposed load");
detail::check_lsc_vector_size<NElts / NElemsInDword>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / NElemsInDword>();

__ESIMD_NS::simd<uint32_t, NElts / NElemsInDword> result =
__esimd_lsc_load_stateless<uint32_t, L1H, L3H, _AddressScale,
_ImmOffset, lsc_data_size::u32, _VS,
_Transposed, N>(pred.data(), addrs.data());
return result.template bit_cast_view<T>();
}
}
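
For illustration, a hedged usage sketch of the new USM path (not part of this diff; the function name and pointers are invented, and it assumes a DPC++ build with ESIMD support and the headers shown):

```cpp
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>
#include <cstdint>

namespace esimd = sycl::ext::intel::esimd;
namespace exp_esimd = sycl::ext::intel::experimental::esimd;

// Copies 32 16-bit elements with one transposed LSC load/store pair.
// With this patch, NElts = 32 is legal for a 16-bit T because the elements
// pack into 32 / 2 = 16 dwords, a valid LSC vector size.
SYCL_EXTERNAL void copy_block_u16(const uint16_t *src, uint16_t *dst)
    SYCL_ESIMD_FUNCTION {
  esimd::simd<uint16_t, 32> data =
      exp_esimd::lsc_block_load<uint16_t, 32>(src);
  exp_esimd::lsc_block_store<uint16_t, 32>(dst, data);
}
```

An odd NElts such as 31 would fail the new `NElts % NElemsInDword == 0` static_assert at compile time.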

/// Accessor-based transposed gather with 1 channel.
@@ -545,22 +558,38 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
return lsc_block_load<T, NElts, DS, L1H, L3H>(
__ESIMD_DNS::accessorToPointer<T>(acc, offset), pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed load is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(), si);

if constexpr (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64) {
detail::check_lsc_vector_size<NElts>();
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
offsets.data(), si);
} else if constexpr (_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8) {
constexpr int NElemsInDword = _DS == lsc_data_size::u16 ? 2 : 4;
static_assert(NElts % NElemsInDword == 0,
"Number of elements is not supported by Transposed load");

detail::check_lsc_vector_size<NElts / NElemsInDword>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / NElemsInDword>();

__ESIMD_NS::simd<uint32_t, NElts / NElemsInDword> result =
__esimd_lsc_load_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(), si);
return result.template bit_cast_view<T>();
}
#endif
}
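
The round trip through `bit_cast_view` is sound because the u32 load fetches exactly the bytes the caller asked for, just grouped as dwords. A minimal host-side sketch of that reinterpretation, using `memcpy` in place of `bit_cast_view`:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // 8 uint16_t elements occupy the same 16 bytes as 8 / 2 = 4 dwords.
  uint16_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8};

  // What the hardware does: read the buffer as 4 dwords (u32 data size).
  uint32_t dwords[4];
  std::memcpy(dwords, src, sizeof(src));

  // What bit_cast_view does: reinterpret those dwords as uint16_t again.
  uint16_t roundtrip[8];
  std::memcpy(roundtrip, dwords, sizeof(dwords));

  for (int i = 0; i < 8; ++i)
    assert(roundtrip[i] == src[i]); // bytes are untouched end to end
  return 0;
}
```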

@@ -627,6 +656,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();

static_assert(
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed prefetch is supported only for data size u32 or u64");
@@ -635,6 +665,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd_mask<N> pred = 1;

__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
__esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
@@ -918,22 +949,35 @@ template <typename T, uint8_t NElts = 1,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
__ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed store is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), addrs.data(),
vals.data());
if constexpr (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64) {
detail::check_lsc_vector_size<NElts>();
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(), addrs.data(),
vals.data());
} else if constexpr (_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8) {
constexpr int NElemsInDword = _DS == lsc_data_size::u16 ? 2 : 4;
static_assert(NElts % NElemsInDword == 0,
"Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / NElemsInDword>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / NElemsInDword>();
__ESIMD_NS::simd<uint32_t, NElts / NElemsInDword> tmp =
vals.template bit_cast_view<uint32_t>();

__esimd_lsc_store_stateless<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), addrs.data(), tmp.data());
}
}
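
Taken together, the static_assert and `check_lsc_vector_size<NElts / NElemsInDword>()` determine which element counts the narrow types accept. A hedged compile-time sketch of that rule (the helper names are invented; the legal vector sizes 1, 2, 3, 4, 8, 16, 32 and 64 are my reading of the LSC vector-size encoding):

```cpp
#include <cstdint>

// Legal LSC vector sizes, assumed to mirror detail::check_lsc_vector_size.
constexpr bool is_lsc_vector_size(int n) {
  return n == 1 || n == 2 || n == 3 || n == 4 || n == 8 || n == 16 ||
         n == 32 || n == 64;
}

// True when NElts elements of type T are accepted by the transposed
// 8/16-bit path: they must fill whole dwords, and the packed dword count
// must itself be a legal LSC vector size.
template <typename T, int NElts> constexpr bool narrow_block_ok() {
  constexpr int elems_per_dword = 4 / sizeof(T); // 2 for u16, 4 for u8
  return NElts % elems_per_dword == 0 &&
         is_lsc_vector_size(NElts / elems_per_dword);
}

static_assert(narrow_block_ok<uint16_t, 64>()); // 32 dwords: OK
static_assert(narrow_block_ok<uint8_t, 16>());  // 4 dwords: OK
static_assert(!narrow_block_ok<uint8_t, 20>()); // 5 dwords: illegal size
static_assert(!narrow_block_ok<uint16_t, 7>()); // does not fill dwords
```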

/// Accessor-based transposed scatter with 1 channel.
@@ -967,23 +1011,38 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
lsc_block_store<T, NElts, DS, L1H>(
__ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed store is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;

__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
vals.data(), si);

if constexpr (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64) {
detail::check_lsc_vector_size<NElts>();
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
vals.data(), si);
} else if constexpr (_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8) {
constexpr int NElemsInDword = _DS == lsc_data_size::u16 ? 2 : 4;
static_assert(NElts % NElemsInDword == 0,
"Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / NElemsInDword>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / NElemsInDword>();
__ESIMD_NS::simd<uint32_t, NElts / NElemsInDword> tmp =
vals.template bit_cast_view<uint32_t>();

__esimd_lsc_store_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(), tmp.data(), si);
}
#endif
}
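
Finally, a hedged end-to-end sketch of the accessor-based path with 8-bit data (illustrative only: the buffer sizes and names are invented, and the offset argument is assumed to be in bytes):

```cpp
#include <sycl/sycl.hpp>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>
#include <cstdint>
#include <vector>

namespace esimd = sycl::ext::intel::esimd;
namespace exp_esimd = sycl::ext::intel::experimental::esimd;

int main() {
  sycl::queue q;
  std::vector<uint8_t> in(64, 7), out(64, 0);
  {
    sycl::buffer<uint8_t, 1> in_buf(in.data(), in.size());
    sycl::buffer<uint8_t, 1> out_buf(out.data(), out.size());
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor in_acc{in_buf, cgh, sycl::read_only};
      sycl::accessor out_acc{out_buf, cgh, sycl::write_only};
      cgh.single_task([=]() SYCL_ESIMD_KERNEL {
        // 64 u8 elements travel as 64 / 4 = 16 dwords on the wire.
        esimd::simd<uint8_t, 64> v =
            exp_esimd::lsc_block_load<uint8_t, 64>(in_acc, 0);
        exp_esimd::lsc_block_store<uint8_t, 64>(out_acc, 0, v);
      });
    });
  } // buffer destructors copy the results back to the host vectors
  return 0;
}
```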
