[ESIMD] Reduce number of bit-casts generated for lsc_block_load/store operations #8385


Merged: 12 commits, Feb 21, 2023
@@ -401,7 +401,6 @@ void __esimd_emu_lsc_offset_write(
std::conditional_t<DS ==
__ESIMD_ENS::lsc_data_size::u16u32,
uint16_t, void>>>>>>;

for (int OffsetIdx = 0; OffsetIdx < N; OffsetIdx += 1) {
if (Pred[OffsetIdx] == 0) {
      // Skip input vector elements corresponding to
@@ -420,7 +419,12 @@ void __esimd_emu_lsc_offset_write(
VecIdx += vectorIndexIncrement<N, _Transposed>()) {

if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) {
          *((StoreType *)(WriteBase + ByteDistance)) = vals[VecIdx];
          if constexpr (std::is_floating_point<Ty>::value) {
            *((StoreType *)(WriteBase + ByteDistance)) =
                sycl::bit_cast<StoreType>(vals[VecIdx]);
          } else {
            *((StoreType *)(WriteBase + ByteDistance)) = vals[VecIdx];
          }
        }
      }
    }

Contributor (review comment on the bit_cast branch above): Can't this branch be used unconditionally, i.e. for both the if and the else paths?
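One likely reason the conversion branch has to stay: sycl::bit_cast, like std::bit_cast, is only well-formed when source and destination have the same size, whereas for the narrowing data sizes (e.g. u16u32, where StoreType is uint16_t but the stored element type is 32-bit) the plain assignment performs the required integral conversion. A minimal host-only sketch, with std::bit_cast standing in for sycl::bit_cast and all names illustrative:

#include <bit>
#include <cstdint>
#include <cstring>
#include <type_traits>

// Host-only sketch, not part of the PR. bit_cast needs equally sized types,
// so the else-branch (integral conversion) cannot be folded into it.
template <typename StoreT, typename ValT>
void store_one(unsigned char *dst, ValT val) {
  if constexpr (std::is_floating_point_v<ValT>) {
    static_assert(sizeof(StoreT) == sizeof(ValT),
                  "bit_cast requires equally sized types");
    StoreT bits = std::bit_cast<StoreT>(val); // reinterpret the bit pattern
    std::memcpy(dst, &bits, sizeof(bits));
  } else {
    StoreT narrowed = static_cast<StoreT>(val); // possibly narrowing conversion
    std::memcpy(dst, &narrowed, sizeof(narrowed));
  }
}

// float -> uint32_t stores the bit pattern; uint32_t -> uint16_t converts,
// and bit_cast would be ill-formed there because the sizes differ.
template void store_one<uint32_t, float>(unsigned char *, float);
template void store_one<uint16_t, uint32_t>(unsigned char *, uint32_t);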
@@ -1177,7 +1181,12 @@ __ESIMD_INTRIN void __esimd_lsc_store_stateless(
for (int ChanelIdx = 0, VecIdx = AddrIdx; ChanelIdx < ChanlCount;
ChanelIdx += 1, ByteDistance += rawAddressIncrement<Ty, DS>(),
VecIdx += vectorIndexIncrement<N, _Transposed>()) {
*((StoreType *)(BaseAddr + ByteDistance)) = vals[VecIdx];
if constexpr (std::is_floating_point<Ty>::value) {
*((StoreType *)(BaseAddr + ByteDistance)) =
sycl::bit_cast<StoreType>(vals[VecIdx]);
} else {
*((StoreType *)(BaseAddr + ByteDistance)) = vals[VecIdx];
}
}
}
}
132 changes: 55 additions & 77 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -590,8 +590,9 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts / SmallIntFactor>();

// Prepare template arguments for the call of intrinsic.
using LoadElemT =
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>;
using LoadElemT = std::conditional_t<
std::is_floating_point<T>::value, T,
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>>;
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr auto _DS = FDS == lsc_data_size::u64 ? FDS : lsc_data_size::u32;
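For reference, a standalone sketch of how the new element-type selection resolves (the local enum and alias name are stand-ins for the real lsc_data_size and LoadElemT): floating-point T is passed straight through, so the loaded vector no longer needs a bit-cast back to T, while integral T still maps onto uint32_t or uint64_t.

#include <cstdint>
#include <type_traits>

// Stand-in for lsc_data_size; only the sizes relevant here.
enum class data_size { u32, u64 };

// Mirrors the LoadElemT selection added above.
template <typename T, data_size FDS>
using load_elem_t = std::conditional_t<
    std::is_floating_point<T>::value, T,
    std::conditional_t<FDS == data_size::u64, uint64_t, uint32_t>>;

static_assert(std::is_same_v<load_elem_t<float, data_size::u32>, float>);
static_assert(std::is_same_v<load_elem_t<double, data_size::u64>, double>);
static_assert(std::is_same_v<load_elem_t<int32_t, data_size::u32>, uint32_t>);
static_assert(std::is_same_v<load_elem_t<int64_t, data_size::u64>, uint64_t>);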
@@ -650,8 +651,9 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred,
detail::check_lsc_vector_size<NElts / SmallIntFactor>();

// Prepare template arguments for the call of intrinsic.
using LoadElemT =
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>;
using LoadElemT = std::conditional_t<
std::is_floating_point<T>::value, T,
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>>;
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr auto _DS = FDS == lsc_data_size::u64 ? FDS : lsc_data_size::u32;
@@ -714,8 +716,9 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
detail::check_lsc_vector_size<NElts / SmallIntFactor>();

// Prepare template arguments for the call of intrinsic.
using LoadElemT =
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>;
using LoadElemT = std::conditional_t<
std::is_floating_point<T>::value, T,
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>>;
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr auto _DS = FDS == lsc_data_size::u64 ? FDS : lsc_data_size::u32;
@@ -779,8 +782,9 @@ lsc_block_load(AccessorTy acc, uint32_t offset, __ESIMD_NS::simd_mask<1> pred,
detail::check_lsc_vector_size<NElts / SmallIntFactor>();

// Prepare template arguments for the call of intrinsic.
using LoadElemT =
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>;
using LoadElemT = std::conditional_t<
std::is_floating_point<T>::value, T,
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>>;
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr auto _DS = FDS == lsc_data_size::u64 ? FDS : lsc_data_size::u32;
@@ -1206,43 +1210,32 @@ __ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
__ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr lsc_data_size FDS = detail::finalize_data_size<T, DS>();
constexpr int SmallIntFactor =
(FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
  static_assert(NElts > 0 && NElts % SmallIntFactor == 0,
                "Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / SmallIntFactor>();

// Prepare template arguments for the call of intrinsic.
using StoreElemT = std::conditional_t<
std::is_floating_point<T>::value, T,
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>>;
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr auto _DS = FDS == lsc_data_size::u64 ? FDS : lsc_data_size::u32;
constexpr auto _VS = detail::to_lsc_vector_size<NElts / SmallIntFactor>();
constexpr auto _Transposed = detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
if constexpr (SmallIntFactor == 1) {
if constexpr (_DS == lsc_data_size::u32) {
__esimd_lsc_store_stateless<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
_DS, _VS, _Transposed, N>(
pred.data(), addrs.data(),
sycl::bit_cast<__ESIMD_DNS::vector_type_t<uint32_t, NElts>>(
vals.data()));
} else {
__esimd_lsc_store_stateless<uint64_t, L1H, L3H, _AddressScale, _ImmOffset,
_DS, _VS, _Transposed, N>(
pred.data(), addrs.data(),
sycl::bit_cast<__ESIMD_DNS::vector_type_t<uint64_t, NElts>>(
vals.data()));
}
} else {
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> tmp = sycl::bit_cast<
__ESIMD_DNS::vector_type_t<uint32_t, NElts / SmallIntFactor>>(
vals.data());

__esimd_lsc_store_stateless<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), addrs.data(), tmp.data());
}
__ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);

__esimd_lsc_store_stateless<StoreElemT, L1H, L3H, _AddressScale, _ImmOffset,
_DS, _VS, _Transposed, N>(
pred.data(), Addrs.data(),
sycl::bit_cast<
__ESIMD_DNS::vector_type_t<StoreElemT, NElts / SmallIntFactor>>(
vals.data()));
}
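The rewritten body relies on the same packing as before: elements narrower than 32 bits are grouped into 32-bit chunks, and the whole simd<T, NElts> is reinterpreted once as simd<StoreElemT, NElts / SmallIntFactor>. A standalone sketch of that arithmetic, with a local enum standing in for lsc_data_size:

enum class data_size { u8, u16, u32, u64 };

// Same rule as the SmallIntFactor constant above.
constexpr int small_int_factor(data_size fds) {
  return fds == data_size::u16 ? 2 : (fds == data_size::u8 ? 4 : 1);
}

// 8 x uint16_t are stored as 4 x uint32_t; 16 x uint8_t as 4 x uint32_t.
static_assert(8 / small_int_factor(data_size::u16) == 4);
static_assert(16 / small_int_factor(data_size::u8) == 4);
// 32-bit, 64-bit and floating-point elements are stored 1:1.
static_assert(16 / small_int_factor(data_size::u32) == 16);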

/// Accessor-based transposed scatter with 1 channel.
@@ -1279,48 +1272,33 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
#else
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr lsc_data_size FDS = detail::finalize_data_size<T, DS>();
constexpr int SmallIntFactor =
(FDS == lsc_data_size::u16) ? 2 : (FDS == lsc_data_size::u8 ? 4 : 1);
  static_assert(NElts > 0 && NElts % SmallIntFactor == 0,
                "Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / SmallIntFactor>();

// Prepare template arguments for the call of intrinsic.
using StoreElemT = std::conditional_t<
std::is_floating_point<T>::value, T,
std::conditional_t<FDS == lsc_data_size::u64, uint64_t, uint32_t>>;
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr auto _DS = FDS == lsc_data_size::u64 ? FDS : lsc_data_size::u32;
constexpr auto _VS = detail::to_lsc_vector_size<NElts / SmallIntFactor>();
constexpr auto _Transposed = detail::lsc_data_order::transpose;
constexpr int N = 1;

__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);

detail::check_lsc_vector_size<NElts / SmallIntFactor>();
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed store");
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
if constexpr (SmallIntFactor > 1) {
__esimd_lsc_store_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(),
sycl::bit_cast<
__ESIMD_DNS::vector_type_t<uint32_t, NElts / SmallIntFactor>>(
vals.data()),
si);
} else {
if constexpr (_DS == lsc_data_size::u32) {
__esimd_lsc_store_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(
pred.data(), offsets.data(),
sycl::bit_cast<__ESIMD_DNS::vector_type_t<uint32_t, NElts>>(
vals.data()),
si);
} else {
__esimd_lsc_store_bti<uint64_t, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(
pred.data(), offsets.data(),
sycl::bit_cast<__ESIMD_DNS::vector_type_t<uint64_t, NElts>>(
vals.data()),
si);
}
}
__esimd_lsc_store_bti<StoreElemT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(
pred.data(), offsets.data(),
sycl::bit_cast<
__ESIMD_DNS::vector_type_t<StoreElemT, NElts / SmallIntFactor>>(
vals.data()),
si);
#endif
}
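For completeness, a hypothetical usage sketch; the setup, the kernel-side context, and the default template arguments for data size and cache hints are assumptions rather than something this PR shows. With StoreElemT equal to T for floating-point element types, a simd<float, 16> reaches the intrinsic as float and the wrapper emits no bit-cast:

#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>

namespace esimd = sycl::ext::intel::esimd;
namespace exp_esimd = sycl::ext::intel::experimental::esimd;

// To be called from an ESIMD kernel; dst is assumed to be USM device memory.
void store_row(float *dst) SYCL_ESIMD_FUNCTION {
  esimd::simd<float, 16> vals(0.0f, 1.0f); // 0, 1, 2, ..., 15
  // StoreElemT resolves to float, so the vector is passed to the intrinsic
  // directly, without a bit_cast in the wrapper.
  exp_esimd::lsc_block_store<float, 16>(dst, vals);
}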
