Skip to content

Commit 44136bd

Browse files
kbobrovs, alexbatashev, and Fznamznon
authored
[SYCL][ESIMD] Setup compilation pipeline for ESIMD (#2134)
- Add passes in BackendUtil to enable ESIMD back-end code generation. - Add more ESIMD tests - Minor fix in SPIRV intrinsic translation - Fix some existing ESIMD tests Authors: Gang Chen <[email protected]> Denis Bakhvalov <[email protected]> Konstantin S Bobrovsky <[email protected]> Wei Pan, Shahab Layeghi. Co-authored-by: Alexander Batashev <[email protected]> Co-authored-by: Mariya Podchishchaeva <[email protected]>
1 parent a08674e commit 44136bd

File tree

11 files changed

+538
-4
lines changed

11 files changed

+538
-4
lines changed

clang/lib/CodeGen/BackendUtil.cpp

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@
4141
#include "llvm/Passes/PassBuilder.h"
4242
#include "llvm/Passes/PassPlugin.h"
4343
#include "llvm/Passes/StandardInstrumentations.h"
44+
#include "llvm/SYCLLowerIR/LowerESIMD.h"
4445
#include "llvm/Support/BuryPointer.h"
4546
#include "llvm/Support/CommandLine.h"
4647
#include "llvm/Support/MemoryBuffer.h"
@@ -786,6 +787,25 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,
786787

787788
PMBuilder.populateFunctionPassManager(FPM);
788789
PMBuilder.populateModulePassManager(MPM);
790+
791+
// Customize the tail of the module passes list for the ESIMD extension.
792+
if (LangOpts.SYCLIsDevice && LangOpts.SYCLExplicitSIMD &&
793+
CodeGenOpts.OptimizationLevel != 0) {
794+
MPM.add(createESIMDLowerVecArgPass());
795+
MPM.add(createESIMDLowerLoadStorePass());
796+
MPM.add(createSROAPass());
797+
MPM.add(createEarlyCSEPass(true));
798+
MPM.add(createInstructionCombiningPass());
799+
MPM.add(createDeadCodeEliminationPass());
800+
MPM.add(createFunctionInliningPass(
801+
CodeGenOpts.OptimizationLevel, CodeGenOpts.OptimizeSize,
802+
(!CodeGenOpts.SampleProfileFile.empty() &&
803+
CodeGenOpts.PrepareForThinLTO)));
804+
MPM.add(createSROAPass());
805+
MPM.add(createEarlyCSEPass(true));
806+
MPM.add(createInstructionCombiningPass());
807+
MPM.add(createDeadCodeEliminationPass());
808+
}
789809
}
790810

791811
static void setCommandLineOpts(const CodeGenOptions &CodeGenOpts) {
@@ -880,6 +900,11 @@ void EmitAssemblyHelper::EmitAssembly(BackendAction Action,
880900
PerFunctionPasses.add(
881901
createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
882902

903+
// ESIMD extension always requires lowering of certain IR constructs, such as
904+
// ESIMD C++ intrinsics, as the last FE step.
905+
if (LangOpts.SYCLIsDevice && LangOpts.SYCLExplicitSIMD)
906+
PerModulePasses.add(createSYCLLowerESIMDPass());
907+
883908
CreatePasses(PerModulePasses, PerFunctionPasses);
884909

885910
legacy::PassManager CodeGenPasses;

clang/lib/Frontend/CompilerInvocation.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -816,10 +816,14 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
816816
Args.getLastArg(OPT_emit_llvm_uselists, OPT_no_emit_llvm_uselists))
817817
Opts.EmitLLVMUseLists = A->getOption().getID() == OPT_emit_llvm_uselists;
818818

819+
// ESIMD GPU Back-end requires optimized IR
820+
bool IsSyclESIMD = Args.hasFlag(options::OPT_fsycl_esimd,
821+
options::OPT_fno_sycl_esimd, false);
822+
819823
Opts.DisableLLVMPasses =
820824
Args.hasArg(OPT_disable_llvm_passes) ||
821825
(Args.hasArg(OPT_fsycl_is_device) && Triple.isSPIR() &&
822-
!Args.hasArg(OPT_fsycl_enable_optimizations));
826+
!Args.hasArg(OPT_fsycl_enable_optimizations) && !IsSyclESIMD);
823827
Opts.DisableLifetimeMarkers = Args.hasArg(OPT_disable_lifetimemarkers);
824828

825829
const llvm::Triple::ArchType DebugEntryValueArchs[] = {

clang/test/CodeGenSYCL/esimd-private-global.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,6 @@ __attribute__((opencl_private)) __attribute__((register_num(17))) int vc;
99

1010
SYCL_EXTERNAL void init_vc(int x) {
1111
vc = x;
12-
// CHECK: store i32 %0, i32* @vc
12+
// CHECK: store i32 %{{[0-9a-zA-Z_]+}}, i32* @vc
1313
}
1414
// CHECK: attributes #0 = {{.*"VCByteOffset"="17".*"VCVolatile"}}

llvm/lib/SYCLLowerIR/LowerESIMD.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -820,7 +820,7 @@ translateSpirvIntrinsic(CallInst *CI, StringRef SpirvIntrName,
820820
auto translateSpirvIntr = [&SpirvIntrName, &ESIMDToErases,
821821
CI](StringRef SpvIName, auto TranslateFunc) {
822822
if (SpirvIntrName.consume_front(SpvIName)) {
823-
Value *TranslatedV = TranslateFunc(*CI, SpirvIntrName);
823+
Value *TranslatedV = TranslateFunc(*CI, SpirvIntrName.substr(1, 1));
824824
CI->replaceAllUsesWith(TranslatedV);
825825
ESIMDToErases.push_back(CI);
826826
}

llvm/test/SYCLLowerIR/esimd_lower_intrins.ll

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -170,6 +170,17 @@ define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 {
170170
; CHECK-NEXT: ret void
171171
}
172172

173+
define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 {
174+
; CHECK: define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1
175+
%call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
176+
; CHECK-NEXT: %call.esimd = call <3 x i32> @llvm.genx.local.id.v3i32()
177+
; CHECK-NEXT: %local_id.x = extractelement <3 x i32> %call.esimd, i32 0
178+
; CHECK-NEXT: %local_id.x.cast.ty = zext i32 %local_id.x to i64
179+
ret void
180+
; CHECK-NEXT: ret void
181+
}
182+
183+
declare dso_local spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
173184
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
174185
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
175186
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i32> %2, <32 x i16> %3)

llvm/test/SYCLLowerIR/esimd_subroutine.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ entry:
4949
}
5050

5151
; Function Attrs: norecurse nounwind
52-
; CHECK: define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(<16 x i32> addrspace(4)* [[OLDARG0:%[a-zA-Z0-9_]*]], <16 x i32> addrspace(4)* [[OLDARG1:%[a-zA-Z0-9_]*]]) unnamed_addr {{.+}}
52+
; CHECK: define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(<16 x i32> addrspace(4)* [[OLDARG0:%[a-zA-Z0-9_]*]], <16 x i32> addrspace(4)*{{.*}} [[OLDARG1:%[a-zA-Z0-9_]*]]) unnamed_addr {{.+}}
5353
define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(%class._ZTS4simdIiLi16EE.simd addrspace(4)* %this, %class._ZTS4simdIiLi16EE.simd addrspace(4)* align 64 dereferenceable(64) %other) unnamed_addr #0 comdat align 2 {
5454
entry:
5555
; CHECK: [[NEWARG1:%[a-zA-Z0-9_]*]] = bitcast <16 x i32> addrspace(4)* [[OLDARG1]] to {{.+}}

sycl/test/basic_tests/esimd/vadd.cpp

Lines changed: 105 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,105 @@
1+
// TODO ESIMD enable host device under -fsycl
2+
// RUN: %clangxx -I %sycl_include %s -o %t.out -lsycl
3+
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
4+
5+
#include <CL/sycl.hpp>
6+
#include <CL/sycl/intel/esimd.hpp>
7+
#include <iostream>
8+
9+
using namespace cl::sycl;
10+
11+
class ESIMDSelector : public device_selector {
12+
// Require GPU device unless HOST is requested in SYCL_DEVICE_TYPE env
13+
virtual int operator()(const device &device) const {
14+
if (const char *dev_type = getenv("SYCL_DEVICE_TYPE")) {
15+
if (!strcmp(dev_type, "GPU"))
16+
return device.is_gpu() ? 1000 : -1;
17+
if (!strcmp(dev_type, "HOST"))
18+
return device.is_host() ? 1000 : -1;
19+
std::cerr << "Supported 'SYCL_DEVICE_TYPE' env var values are 'GPU' and "
20+
"'HOST', '"
21+
<< dev_type << "' is not.\n";
22+
return -1;
23+
}
24+
// If "SYCL_DEVICE_TYPE" not defined, only allow gpu device
25+
return device.is_gpu() ? 1000 : -1;
26+
}
27+
};
28+
29+
auto exception_handler = [](exception_list l) {
30+
for (auto ep : l) {
31+
try {
32+
std::rethrow_exception(ep);
33+
} catch (cl::sycl::exception &e0) {
34+
std::cout << "sycl::exception: " << e0.what() << std::endl;
35+
} catch (std::exception &e) {
36+
std::cout << "std::exception: " << e.what() << std::endl;
37+
} catch (...) {
38+
std::cout << "generic exception\n";
39+
}
40+
}
41+
};
42+
43+
int main(void) {
44+
constexpr unsigned Size = 256;
45+
constexpr unsigned VL = 32;
46+
constexpr unsigned GroupSize = 2;
47+
48+
int A[Size];
49+
int B[Size];
50+
int C[Size] = {};
51+
52+
for (unsigned i = 0; i < Size; ++i) {
53+
A[i] = B[i] = i;
54+
}
55+
56+
{
57+
cl::sycl::buffer<int, 1> bufA(A, Size);
58+
cl::sycl::buffer<int, 1> bufB(B, Size);
59+
cl::sycl::buffer<int, 1> bufC(C, Size);
60+
61+
// We need that many task groups
62+
cl::sycl::range<1> GroupRange{Size / VL};
63+
64+
// We need that many tasks in each group
65+
cl::sycl::range<1> TaskRange{GroupSize};
66+
67+
cl::sycl::nd_range<1> Range{GroupRange, TaskRange};
68+
69+
queue q(ESIMDSelector{}, exception_handler);
70+
q.submit([&](cl::sycl::handler &cgh) {
71+
auto accA = bufA.get_access<cl::sycl::access::mode::read>(cgh);
72+
auto accB = bufB.get_access<cl::sycl::access::mode::read>(cgh);
73+
auto accC = bufC.get_access<cl::sycl::access::mode::write>(cgh);
74+
75+
cgh.parallel_for<class Test>(
76+
Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
77+
using namespace sycl::intel::gpu;
78+
auto pA = accA.get_pointer().get();
79+
auto pB = accB.get_pointer().get();
80+
auto pC = accC.get_pointer().get();
81+
82+
int i = ndi.get_global_id(0);
83+
constexpr int ESIZE = sizeof(int);
84+
simd<uint32_t, VL> offsets(0, ESIZE);
85+
86+
simd<int, VL> va = gather<int, VL>(pA + i * VL, offsets);
87+
simd<int, VL> vb = block_load<int, VL>(pB + i * VL);
88+
simd<int, VL> vc = va + vb;
89+
90+
block_store<int, VL>(pC + i * VL, vc);
91+
});
92+
});
93+
94+
for (unsigned i = 0; i < Size; ++i) {
95+
if (A[i] + B[i] != C[i]) {
96+
std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
97+
<< " + " << B[i] << "\n";
98+
return 1;
99+
}
100+
}
101+
}
102+
103+
std::cout << "Passed\n";
104+
return 0;
105+
}

sycl/test/esimd/glob.cpp

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
// RUN: %clangxx -fsycl -fsycl-explicit-simd -c -fsycl-device-only -Xclang -emit-llvm %s -o - | \
2+
// RUN: FileCheck %s
3+
4+
// This test checks that globals with register attribute are allowed in ESIMD
5+
// mode, can be accessed in functions and correct LLVM IR is generated
6+
// (including translation of the register attribute)
7+
8+
#include <CL/sycl.hpp>
9+
#include <CL/sycl/intel/esimd.hpp>
10+
#include <iostream>
11+
12+
using namespace cl::sycl;
13+
using namespace sycl::intel::gpu;
14+
15+
constexpr unsigned VL = 16;
16+
17+
ESIMD_PRIVATE ESIMD_REGISTER(17) simd<int, VL> vc;
18+
// CHECK-DAG: @vc = {{.+}} <16 x i32> zeroinitializer, align 64 #0
19+
// CHECK-DAG: attributes #0 = { {{.*}}"VCByteOffset"="17" "VCGlobalVariable" "VCVolatile"{{.*}} }
20+
21+
ESIMD_PRIVATE ESIMD_REGISTER(17 + VL) simd<int, VL> vc1;
22+
// CHECK-DAG: @vc1 = {{.+}} <16 x i32> zeroinitializer, align 64 #1
23+
// CHECK-DAG: attributes #1 = { {{.*}}"VCByteOffset"="33" "VCGlobalVariable" "VCVolatile"{{.*}} }
24+
25+
SYCL_EXTERNAL ESIMD_NOINLINE void init_vc(int x) {
26+
vc1 = vc + 1;
27+
vc = x;
28+
}

sycl/test/esimd/hw_compile.cpp

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
// Basic ESIMD test which checks that ESIMD invocation syntax can get compiled.
2+
// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -c %s -o %t.bc
3+
4+
#include <CL/sycl.hpp>
5+
#include <CL/sycl/intel/esimd.hpp>
6+
#include <iostream>
7+
8+
int main(void) {
9+
constexpr unsigned Size = 4;
10+
int A[Size] = {1, 2, 3, 4};
11+
int B[Size] = {1, 2, 3, 4};
12+
int C[Size];
13+
14+
{
15+
cl::sycl::range<1> UnitRange{1};
16+
cl::sycl::buffer<int, 1> bufA(A, UnitRange);
17+
cl::sycl::buffer<int, 1> bufB(B, UnitRange);
18+
cl::sycl::buffer<int, 1> bufC(C, UnitRange);
19+
20+
cl::sycl::queue().submit([&](cl::sycl::handler &cgh) {
21+
auto accA = bufA.get_access<cl::sycl::access::mode::read>(cgh);
22+
auto accB = bufB.get_access<cl::sycl::access::mode::read>(cgh);
23+
auto accC = bufC.get_access<cl::sycl::access::mode::write>(cgh);
24+
25+
cgh.parallel_for<class Test>(UnitRange * UnitRange,
26+
[=](sycl::id<1> i) SYCL_ESIMD_KERNEL {
27+
// those operations below would normally be
28+
// represented as a single vector operation
29+
// through ESIMD vector
30+
accC[i + 0] = accA[i + 0] + accB[i + 0];
31+
accC[i + 1] = accA[i + 1] + accB[i + 1];
32+
accC[i + 2] = accA[i + 2] + accB[i + 2];
33+
accC[i + 3] = accA[i + 3] + accB[i + 3];
34+
});
35+
});
36+
}
37+
38+
return 0;
39+
}

sycl/test/esimd/intrins_trans.cpp

Lines changed: 117 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,117 @@
1+
// RUN: %clangxx -O0 -fsycl -fsycl-explicit-simd -fsycl-device-only -Xclang -emit-llvm %s -o - | \
2+
// RUN: FileCheck %s
3+
4+
// Checks ESIMD intrinsic translation.
5+
// NOTE: must be run in -O0, as optimizer optimizes away some of the code
6+
7+
#include <CL/sycl.hpp>
8+
#include <CL/sycl/detail/image_ocl_types.hpp>
9+
#include <CL/sycl/intel/esimd.hpp>
10+
11+
using namespace sycl::intel::gpu;
12+
13+
ESIMD_PRIVATE vector_type_t<int, 32> vc;
14+
ESIMD_PRIVATE ESIMD_REGISTER(192) simd<int, 16> vg;
15+
16+
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo();
17+
18+
class EsimdFunctor {
19+
public:
20+
void operator()() __attribute__((sycl_explicit_simd)) { foo(); }
21+
};
22+
23+
template <typename name, typename Func>
24+
__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
25+
kernelFunc();
26+
}
27+
28+
void bar() {
29+
EsimdFunctor esimdf;
30+
kernel<class kernel_esimd>(esimdf);
31+
}
32+
33+
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo() {
34+
// CHECK-LABEL: @_Z3foov
35+
constexpr int VL = 32;
36+
uint32_t *ptr = 0;
37+
38+
int x = 0, y = 0, z = 0;
39+
40+
simd<uint32_t, VL> v1(0, x + z);
41+
simd<uint64_t, VL> offsets(0, y);
42+
simd<uintptr_t, VL> v_addr(reinterpret_cast<uintptr_t>(ptr));
43+
simd<ushort, VL> pred;
44+
v_addr += offsets;
45+
46+
__esimd_flat_atomic0<EsimdAtomicOpType::ATOMIC_INC, uint32_t, VL>(
47+
v_addr.data(), pred.data());
48+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.inc.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
49+
50+
__esimd_flat_atomic1<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, VL>(
51+
v_addr.data(), v1, pred.data());
52+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.add.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
53+
__esimd_flat_atomic2<EsimdAtomicOpType::ATOMIC_CMPXCHG, uint32_t, VL>(
54+
v_addr.data(), v1, v1, pred.data());
55+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.cmpxchg.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
56+
57+
uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
58+
simd<uint32_t, VL> v00 =
59+
__esimd_flat_block_read_unaligned<uint32_t, VL>(addr);
60+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32(i64 %{{[0-9a-zA-Z_.]+}})
61+
__esimd_flat_block_write<uint32_t, VL>(addr, v00.data());
62+
// CHECK: call void @llvm.genx.svm.block.st.v32i32(i64 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
63+
64+
simd<uint32_t, VL> v01 =
65+
__esimd_flat_read<uint32_t, VL>(v_addr.data(), 0, pred.data());
66+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.gather.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
67+
68+
__esimd_flat_write<uint32_t, VL>(v_addr.data(), v01.data(), 0, pred.data());
69+
// CHECK: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32i32(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
70+
71+
simd<short, 16> mina(0, 1);
72+
simd<short, 16> minc(5);
73+
minc = __esimd_smin<short, 16>(mina.data(), minc.data());
74+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i16> @llvm.genx.smin.v16i16.v16i16(<16 x i16> %{{[0-9a-zA-Z_.]+}}, <16 x i16> %{{[0-9a-zA-Z_.]+}})
75+
76+
simd<float, 1> diva(2.f);
77+
simd<float, 1> divb(1.f);
78+
diva = __esimd_div_ieee<1>(diva.data(), divb.data());
79+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <1 x float> @llvm.genx.ieee.div.v1f32(<1 x float> %{{[0-9a-zA-Z_.]+}}, <1 x float> %{{[0-9a-zA-Z_.]+}})
80+
81+
simd<float, 16> a(0.1f);
82+
simd<float, 8> b = __esimd_rdregion<float, 16, 8, 0, 8, 1>(a.data(), 0);
83+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x float> @llvm.genx.rdregionf.v8f32.v16f32.i16(<16 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0)
84+
85+
simd<float, 16> c(0.0f);
86+
87+
using PH = cl::sycl::access::placeholder;
88+
89+
cl::sycl::accessor<cl::sycl::cl_int4, 2, cl::sycl::access::mode::read,
90+
cl::sycl::access::target::image, PH::false_t>
91+
pA;
92+
cl::sycl::accessor<cl::sycl::cl_int4, 2, cl::sycl::access::mode::write,
93+
cl::sycl::access::target::image, PH::false_t>
94+
pB;
95+
96+
auto d = __esimd_wrregion<float, 16 /*ret size*/, 8 /*write size*/,
97+
0 /*vstride*/, 8 /*row width*/, 1 /*hstride*/>(
98+
c.data() /*dst*/, b.data() /*src*/, 0 /*offset*/);
99+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x float> @llvm.genx.wrregionf.v16f32.v8f32.i16.v8i1(<16 x float> %{{[0-9a-zA-Z_.]+}}, <8 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
100+
101+
simd<int, 32> va;
102+
va = media_block_load<int, 4, 8>(pA, x, y);
103+
// CHECK: %[[SI0:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_ro_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
104+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %[[SI0]], i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}})
105+
106+
simd<int, 32> vb = va + 1;
107+
media_block_store<int, 4, 8>(pB, x, y, vb);
108+
// CHECK: %[[SI2:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_wo_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
109+
// CHECK: call void @llvm.genx.media.st.v32i32(i32 0, i32 %[[SI2]], i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
110+
111+
auto ee = __esimd_vload<int, 16>((vector_type_t<int, 16> *)(&vg));
112+
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* {{.*}})
113+
__esimd_vstore<int, 32>(&vc, va.data());
114+
// CHECK: store <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> addrspace(4)* {{.*}}
115+
116+
return d;
117+
}

0 commit comments

Comments
 (0)