Skip to content

[SYCL][ESIMD] Move ESIMD APIs to sycl::ext::intel::experimental::esimd. #3695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions llvm/lib/SYCLLowerIR/LowerESIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1243,7 +1243,9 @@ SmallPtrSet<Type *, 4> collectGenXVolatileTypes(Module &M) {
if (!PTy)
continue;
auto GTy = dyn_cast<StructType>(PTy->getPointerElementType());
if (!GTy || !GTy->getName().endswith("cl::sycl::INTEL::gpu::simd"))
// TODO FIXME relying on type name in LLVM IR is fragile, needs rework
if (!GTy || !GTy->getName().endswith(
"cl::sycl::ext::intel::experimental::esimd::simd"))
continue;
assert(GTy->getNumContainedTypes() == 1);
auto VTy = GTy->getContainedType(0);
Expand Down Expand Up @@ -1326,7 +1328,8 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F,

// process ESIMD builtins that go through special handling instead of
// the translation procedure
if (Name.startswith("N2cl4sycl5INTEL3gpu8slm_init")) {
// TODO FIXME slm_init should be made top-level __esimd_slm_init
if (Name.startswith("N2cl4sycl3ext5intel12experimental5esimd8slm_init")) {
// tag the kernel with meta-data SLMSize, and remove this builtin
translateSLMInit(*CI);
ESIMDToErases.push_back(CI);
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ define dso_local spir_func void @FUNC_29() {

define dso_local spir_kernel void @FUNC_30() {
; CHECK: define dso_local spir_kernel void @FUNC_30()
call spir_func void @_ZN2cl4sycl5INTEL3gpu8slm_initEj(i32 1023)
call spir_func void @_ZN2cl4sycl3ext5intel12experimental5esimd8slm_initEj(i32 1023)
ret void
; CHECK-NEXT: ret void
}
Expand Down Expand Up @@ -358,7 +358,7 @@ declare dso_local spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14
declare dso_local spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 %0, %opencl.image2d_wo_t addrspace(1)* %1, i32 %2, i32 %3, i32 %4, i32 %5, <32 x i32> %6)
declare dso_local spir_func <32 x i32> @_Z13__esimd_vloadIiLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i32> addrspace(4)* %0)
declare dso_local spir_func void @_Z14__esimd_vstoreIfLi16EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<16 x float> addrspace(4)* %0, <16 x float> %1)
declare dso_local spir_func void @_ZN2cl4sycl5INTEL3gpu8slm_initEj(i32)
declare dso_local spir_func void @_ZN2cl4sycl3ext5intel12experimental5esimd8slm_initEj(i32)
declare dso_local spir_func <16 x i32> @_Z14__esimd_uudp4aIjjjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
declare dso_local spir_func <16 x i32> @_Z14__esimd_usdp4aIjiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
declare dso_local spir_func <16 x i32> @_Z14__esimd_sudp4aIijjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
Expand Down
17 changes: 12 additions & 5 deletions sycl/include/CL/sycl/INTEL/esimd/detail/esimd_host_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,20 @@

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {

namespace detail {
namespace half_impl {
class half;
} // namespace half_impl
} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)

namespace INTEL {
namespace gpu {
__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace ext {
namespace intel {
namespace experimental {
namespace esimd {
namespace emu {
namespace detail {

Expand Down Expand Up @@ -466,8 +471,10 @@ template <> struct dwordtype<unsigned int> { static const bool value = true; };

} // namespace detail
} // namespace emu
} // namespace gpu
} // namespace INTEL
} // namespace esimd
} // namespace experimental
} // namespace intel
} // namespace ext
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)

Expand Down
104 changes: 56 additions & 48 deletions sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
#include <assert.h>
#include <cstdint>

#define __SIGD sycl::INTEL::gpu::detail
#define __SEIEED sycl::ext::intel::experimental::esimd::detail
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For a future change, I'd like to see these macro names not be first letter of each of the namespaces. I'd like to have you choose a name like __ESIMD_NS that was at least a bit meaningful. But more importantly when ESIMD would move out of experimental, or otherwise change, I don't think you should have to change every single place these macros are used to designate the namespace. The changes to these added to a huge number of extra changes in this PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Maybe it is better to do this change in this PR and retest to avoid disturbance in future. @pvchupin - what do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kbsmith-intel, how about:
__ESIMD_NS, __ESIMD_DETAIL_NS, __ESIMD_EMU_NS, __ESIMD_EMU_DETAIL_NS
?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those names look good to me.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are internal macro, right? I'd prefer it to go as a follow up change. Since we already committed tests we should commit the change asap.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, those are internal. OK, I will make this as a follow-up PR

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think followup would be fine also, and agree with Pavel that since tests have changed, getting source changes in ASAP is desired.

#define __SEIEE sycl::ext::intel::experimental::esimd

// \brief __esimd_rdregion: region access intrinsic.
//
Expand Down Expand Up @@ -64,13 +65,13 @@
//
template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth = 0>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, M>
__esimd_rdregion(__SIGD::vector_type_t<T, N> Input, uint16_t Offset);
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, M>
__esimd_rdregion(__SEIEED::vector_type_t<T, N> Input, uint16_t Offset);

template <typename T, int N, int M, int ParentWidth = 0>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, M>
__esimd_rdindirect(__SIGD::vector_type_t<T, N> Input,
__SIGD::vector_type_t<uint16_t, M> Offset);
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, M>
__esimd_rdindirect(__SEIEED::vector_type_t<T, N> Input,
__SEIEED::vector_type_t<uint16_t, M> Offset);

// __esimd_wrregion returns the updated vector with the region updated.
//
Expand Down Expand Up @@ -121,28 +122,30 @@ __esimd_rdindirect(__SIGD::vector_type_t<T, N> Input,
//
template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth = 0>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, N>
__esimd_wrregion(__SIGD::vector_type_t<T, N> OldVal,
__SIGD::vector_type_t<T, M> NewVal, uint16_t Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask = 1);
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, N>
__esimd_wrregion(__SEIEED::vector_type_t<T, N> OldVal,
__SEIEED::vector_type_t<T, M> NewVal, uint16_t Offset,
__SEIEE::mask_type_t<M> Mask = 1);

template <typename T, int N, int M, int ParentWidth = 0>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, N>
__esimd_wrindirect(__SIGD::vector_type_t<T, N> OldVal,
__SIGD::vector_type_t<T, M> NewVal,
__SIGD::vector_type_t<uint16_t, M> Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask = 1);
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, N>
__esimd_wrindirect(__SEIEED::vector_type_t<T, N> OldVal,
__SEIEED::vector_type_t<T, M> NewVal,
__SEIEED::vector_type_t<uint16_t, M> Offset,
__SEIEE::mask_type_t<M> Mask = 1);

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace INTEL {
namespace gpu {
namespace ext {
namespace intel {
namespace experimental {
namespace esimd {
namespace detail {

/// read from a basic region of a vector, return a vector
template <typename BT, int BN, typename RTy>
__SIGD::vector_type_t<typename RTy::element_type, RTy::length> ESIMD_INLINE
readRegion(const __SIGD::vector_type_t<BT, BN> &Base, RTy Region) {
__SEIEED::vector_type_t<typename RTy::element_type, RTy::length> ESIMD_INLINE
readRegion(const __SEIEED::vector_type_t<BT, BN> &Base, RTy Region) {
using ElemTy = typename RTy::element_type;
auto Base1 = bitcast<ElemTy, BT, BN>(Base);
constexpr int Bytes = BN * sizeof(BT);
Expand All @@ -163,8 +166,9 @@ readRegion(const __SIGD::vector_type_t<BT, BN> &Base, RTy Region) {

/// read from a nested region of a vector, return a vector
template <typename BT, int BN, typename T, typename U>
ESIMD_INLINE __SIGD::vector_type_t<typename T::element_type, T::length>
readRegion(const __SIGD::vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
ESIMD_INLINE __SEIEED::vector_type_t<typename T::element_type, T::length>
readRegion(const __SEIEED::vector_type_t<BT, BN> &Base,
std::pair<T, U> Region) {
// parent-region type
using PaTy = typename shape_type<U>::type;
constexpr int BN1 = PaTy::length;
Expand Down Expand Up @@ -206,8 +210,11 @@ readRegion(const __SIGD::vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
}

} // namespace detail
} // namespace gpu
} // namespace INTEL

} // namespace esimd
} // namespace experimental
} // namespace intel
} // namespace ext
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)

Expand All @@ -217,40 +224,40 @@ readRegion(const __SIGD::vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
// optimization on simd object
//
template <typename T, int N>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, N>
__esimd_vload(const __SIGD::vector_type_t<T, N> *ptr);
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, N>
__esimd_vload(const __SEIEED::vector_type_t<T, N> *ptr);

// vstore
//
// map to the backend vstore intrinsic, used by compiler to control
// optimization on simd object
template <typename T, int N>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void
__esimd_vstore(__SIGD::vector_type_t<T, N> *ptr,
__SIGD::vector_type_t<T, N> vals);
__esimd_vstore(__SEIEED::vector_type_t<T, N> *ptr,
__SEIEED::vector_type_t<T, N> vals);

template <typename T, int N>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION uint16_t
__esimd_any(__SIGD::vector_type_t<T, N> src);
__esimd_any(__SEIEED::vector_type_t<T, N> src);

template <typename T, int N>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION uint16_t
__esimd_all(__SIGD::vector_type_t<T, N> src);
__esimd_all(__SEIEED::vector_type_t<T, N> src);

#ifndef __SYCL_DEVICE_ONLY__

// Implementations of ESIMD intrinsics for the SYCL host device
template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, M>
__esimd_rdregion(__SIGD::vector_type_t<T, N> Input, uint16_t Offset) {
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, M>
__esimd_rdregion(__SEIEED::vector_type_t<T, N> Input, uint16_t Offset) {
uint16_t EltOffset = Offset / sizeof(T);
assert(Offset % sizeof(T) == 0);

int NumRows = M / Width;
assert(M % Width == 0);

__SIGD::vector_type_t<T, M> Result;
__SEIEED::vector_type_t<T, M> Result;
int Index = 0;
for (int i = 0; i < NumRows; ++i) {
for (int j = 0; j < Width; ++j) {
Expand All @@ -261,10 +268,10 @@ __esimd_rdregion(__SIGD::vector_type_t<T, N> Input, uint16_t Offset) {
}

template <typename T, int N, int M, int ParentWidth>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, M>
__esimd_rdindirect(__SIGD::vector_type_t<T, N> Input,
__SIGD::vector_type_t<uint16_t, M> Offset) {
__SIGD::vector_type_t<T, M> Result;
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, M>
__esimd_rdindirect(__SEIEED::vector_type_t<T, N> Input,
__SEIEED::vector_type_t<uint16_t, M> Offset) {
__SEIEED::vector_type_t<T, M> Result;
for (int i = 0; i < M; ++i) {
uint16_t EltOffset = Offset[i] / sizeof(T);
assert(Offset[i] % sizeof(T) == 0);
Expand All @@ -276,17 +283,17 @@ __esimd_rdindirect(__SIGD::vector_type_t<T, N> Input,

template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, N>
__esimd_wrregion(__SIGD::vector_type_t<T, N> OldVal,
__SIGD::vector_type_t<T, M> NewVal, uint16_t Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask) {
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, N>
__esimd_wrregion(__SEIEED::vector_type_t<T, N> OldVal,
__SEIEED::vector_type_t<T, M> NewVal, uint16_t Offset,
__SEIEE::mask_type_t<M> Mask) {
uint16_t EltOffset = Offset / sizeof(T);
assert(Offset % sizeof(T) == 0);

int NumRows = M / Width;
assert(M % Width == 0);

__SIGD::vector_type_t<T, N> Result = OldVal;
__SEIEED::vector_type_t<T, N> Result = OldVal;
int Index = 0;
for (int i = 0; i < NumRows; ++i) {
for (int j = 0; j < Width; ++j) {
Expand All @@ -299,12 +306,12 @@ __esimd_wrregion(__SIGD::vector_type_t<T, N> OldVal,
}

template <typename T, int N, int M, int ParentWidth>
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SIGD::vector_type_t<T, N>
__esimd_wrindirect(__SIGD::vector_type_t<T, N> OldVal,
__SIGD::vector_type_t<T, M> NewVal,
__SIGD::vector_type_t<uint16_t, M> Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask) {
__SIGD::vector_type_t<T, N> Result = OldVal;
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t<T, N>
__esimd_wrindirect(__SEIEED::vector_type_t<T, N> OldVal,
__SEIEED::vector_type_t<T, M> NewVal,
__SEIEED::vector_type_t<uint16_t, M> Offset,
__SEIEE::mask_type_t<M> Mask) {
__SEIEED::vector_type_t<T, N> Result = OldVal;
for (int i = 0; i < M; ++i) {
if (Mask[i]) {
uint16_t EltOffset = Offset[i] / sizeof(T);
Expand All @@ -318,4 +325,5 @@ __esimd_wrindirect(__SIGD::vector_type_t<T, N> OldVal,

#endif // __SYCL_DEVICE_ONLY__

#undef __SIGD
#undef __SEIEE
#undef __SEIEED
Loading