Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 40aff02

Browse files
committed
Merge remote-tracking branch 'upstream/intel' into filter
2 parents e2b17da + bb842e3 commit 40aff02

File tree

89 files changed

+1555
-302
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

89 files changed

+1555
-302
lines changed

SYCL/AOT/gpu.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
// REQUIRES: ocloc, gpu
1010
// UNSUPPORTED: cuda
1111
// CUDA is not compatible with SPIR.
12-
12+
//
13+
// The test is failing with GPU RT 30.0.100.9667
14+
// XFAIL: windows
15+
//
1316
// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device *" %S/Inputs/aot.cpp -o %t.out
1417
// RUN: %GPU_RUN_PLACEHOLDER %t.out

SYCL/AOT/multiple-devices.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,13 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9-
// REQUIRES: opencl-aot, ocloc, aoc, cpu, gpu, accelerator
9+
// REQUIRES: opencl-aot, ocloc, aoc, cpu, gpu, accelerator, llvm-link, llvm-spirv
1010
// UNSUPPORTED: cuda
1111
// CUDA is not compatible with SPIR.
1212

13+
// The test is failing with GPU RT 30.0.100.9667
14+
// XFAIL: windows
15+
1316
// 1-command compilation case
1417
// Targeting CPU, GPU, FPGA
1518
// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice,spir64_gen-unknown-unknown-sycldevice,spir64_fpga-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device *" %S/Inputs/aot.cpp -o %t_all.out
@@ -18,11 +21,22 @@
1821
// RUN: %GPU_RUN_PLACEHOLDER %t_all.out
1922
// RUN: %ACC_RUN_PLACEHOLDER %t_all.out
2023

24+
// FIXME: Change the behavior when proper automation for assert support is
25+
// introduced. For the time being, AOT flow can't detect if specific extension
26+
// is available for this or that device. The automation to be introduced is to
27+
// query native binary generator on specific features.
28+
//
2129
// Produce object file, spirv, device images to combine these differently
2230
// at link-time, thus testing various AOT-compiled images configurations
2331
// RUN: %clangxx -fsycl %S/Inputs/aot.cpp -c -o %t.o
2432
// RUN: %clangxx -fsycl -fsycl-link-targets=spir64-unknown-unknown-sycldevice %t.o -o %t.spv
2533
// AOT-compile device binary images
34+
// Neither of AOT tools can compile several files, hence, here is this
35+
// workaround
36+
// RUN: %llvm_spirv -r %sycl_libs_dir/libsycl-fallback-cassert.spv -o=%T/fallback-cassert.bc
37+
// RUN: %llvm_spirv -r %t.spv -o=%t.bc
38+
// RUN: %llvm_link %t.bc %T/fallback-cassert.bc -o=%t2.bc
39+
// RUN: %llvm_spirv %t2.bc -o=%t.spv
2640
// RUN: opencl-aot %t.spv -o=%t_cpu.ir --device=cpu
2741
// RUN: ocloc -file %t.spv -spirv_input -output %t_gen.out -output_no_suffix -device cfl
2842
// RUN: aoc %t.spv -o %t_fpga.aocx -sycl -dep-files=%t.d

SYCL/Basic/device_code_dae.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44
// CUDA does not support SPIR-V.
55
// RUN: %clangxx -fsycl-device-only -Xclang -fenable-sycl-dae -Xclang -fsycl-int-header=int_header.h %s -c -o device_code.bc -Wno-sycl-strict
66
// RUN: %clangxx %cxx_std_optionc++17 %include_option int_header.h %debug_option -c %s -o host_code.o %sycl_options -Wno-sycl-strict
7-
// RUN: llvm-link -o=linked_device_code.bc device_code.bc
7+
// FIXME Added explicit offline linking step here until online-linking support
8+
// is fixed
9+
// RUN: %llvm_spirv -r %sycl_libs_dir/libsycl-fallback-cassert.spv -o=%T/fallback-cassert.bc
10+
// RUN: llvm-link -o=linked_device_code.bc device_code.bc %T/fallback-cassert.bc
811
// RUN: sycl-post-link -emit-param-info linked_device_code.bc
912
// RUN: llvm-spirv -o linked_device_code.spv linked_device_code.bc
1013
// RUN: echo -e -n "[Code|Properties]\nlinked_device_code.spv|linked_device_code_0.prop" > table.txt

SYCL/Basic/fpga_tests/fpga_aocx.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,20 @@
1111
/// E2E test for AOCX creation/use/run for FPGA
1212
// Produce an archive with device (AOCX) image. To avoid appending objects to
1313
// leftover archives, remove one if exists.
14+
// FIXME Disabled use of devicelib by assert feature until the 2-step build gets
15+
// fixed. For the time being when 2-step build is employed and there's a call to
16+
// devicelib function from kernel, the binary image gets corrupted. Due to
17+
// fallback assert implementation adds a kernel with appropriate call, we have
18+
// it disabled for this test.
1419
// RUN: rm %t_image.a || true
15-
// RUN: %clangxx -fsycl -fintelfpga -fsycl-link=image %S/Inputs/fpga_device.cpp -o %t_image.a
20+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fintelfpga -fsycl-link=image %S/Inputs/fpga_device.cpp -o %t_image.a
1621
// Produce a host object
17-
// RUN: %clangxx -fsycl -fintelfpga %S/Inputs/fpga_host.cpp -c -o %t.o
22+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fintelfpga %S/Inputs/fpga_host.cpp -c -o %t.o
1823

1924
// AOCX with source
20-
// RUN: %clangxx -fsycl -fintelfpga %S/Inputs/fpga_host.cpp %t_image.a -o %t_aocx_src.out
25+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fintelfpga %S/Inputs/fpga_host.cpp %t_image.a -o %t_aocx_src.out
2126
// AOCX with object
22-
// RUN: %clangxx -fsycl -fintelfpga %t.o %t_image.a -o %t_aocx_obj.out
27+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fintelfpga %t.o %t_image.a -o %t_aocx_obj.out
2328
//
2429
// RUN: %ACC_RUN_PLACEHOLDER %t_aocx_src.out
2530
// RUN: %ACC_RUN_PLACEHOLDER %t_aocx_obj.out

SYCL/Basic/fpga_tests/fpga_aocx_win.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,20 @@
1212
/// E2E test for AOCX creation/use/run for FPGA
1313
// Produce an archive with device (AOCX) image. To avoid appending objects to
1414
// leftover archives, remove one if exists.
15+
// FIXME Disabled use of devicelib by assert feature until the 2-step build gets
16+
// fixed. For the time being when 2-step build is employed and there's a call to
17+
// devicelib function from kernel, the binary image gets corrupted. Due to
18+
// fallback assert implementation adds a kernel with appropriate call, we have
19+
// it disabled for this test.
1520
// RUN: rm %t_image.a || true
16-
// RUN: %clangxx -fsycl -fintelfpga -fsycl-link=image %S/Inputs/fpga_device.cpp -o %t_image.lib
21+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fintelfpga -fsycl-link=image %S/Inputs/fpga_device.cpp -o %t_image.lib
1722
// Produce a host object
18-
// RUN: %clangxx -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp -c -o %t.obj
23+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp -c -o %t.obj
1924

2025
// AOCX with source
21-
// RUN: %clangxx -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp %t_image.lib -o %t_aocx_src.out
26+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp %t_image.lib -o %t_aocx_src.out
2227
// AOCX with object
23-
// RUN: %clangxx -fsycl -fintelfpga %t.obj %t_image.lib -o %t_aocx_obj.out
28+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fintelfpga %t.obj %t_image.lib -o %t_aocx_obj.out
2429
//
2530
// RUN: %ACC_RUN_PLACEHOLDER %t_aocx_src.out
2631
// RUN: %ACC_RUN_PLACEHOLDER %t_aocx_obj.out

SYCL/Basic/kernel_bundle/kernel_bundle_api.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out
1+
// Disable fallback assert here until online-support is fixed.
2+
// Use of per-kernel device code split and linking the bundle with all images
3+
// involved leads to multiple definition of AssertHappened structure due each
4+
// device image is statically linked against fallback libdevice.
5+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT=1 -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out
26
// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
37
//
48
// -fsycl-device-code-split is not supported for cuda

SYCL/Basic/multisource.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
// XFAIL: cuda
10+
911
// Separate kernel sources and host code sources
1012
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -c -o %t.kernel.o %s -DINIT_KERNEL -DCALC_KERNEL
1113
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -c -o %t.main.o %s -DMAIN_APP

SYCL/Basic/parallel_for_range.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,6 @@
1313

1414
using namespace cl::sycl;
1515

16-
[[cl::reqd_work_group_size(4, 4, 4)]] void reqd_wg_size_helper() {
17-
// do nothing
18-
}
19-
2016
int main() {
2117
auto AsyncHandler = [](exception_list ES) {
2218
for (auto &E : ES) {
@@ -48,8 +44,8 @@ int main() {
4844
try {
4945
Q.submit([&](handler &CGH) {
5046
CGH.parallel_for<class ReqdWGSizeNegativeA>(
51-
nd_range<3>(range<3>(16, 16, 16), range<3>(8, 8, 8)),
52-
[=](nd_item<3>) { reqd_wg_size_helper(); });
47+
nd_range<3>(range<3>(16, 16, 16), range<3>(8, 8, 8)), [=
48+
](nd_item<3>) [[sycl::reqd_work_group_size(4, 4, 4)]]{});
5349
});
5450
Q.wait_and_throw();
5551
std::cerr
@@ -87,8 +83,8 @@ int main() {
8783
try {
8884
Q.submit([&](handler &CGH) {
8985
CGH.parallel_for<class ReqdWGSizePositiveA>(
90-
nd_range<3>(range<3>(8, 8, 8), range<3>(4, 4, 4)),
91-
[=](nd_item<3>) { reqd_wg_size_helper(); });
86+
nd_range<3>(range<3>(8, 8, 8), range<3>(4, 4, 4)), [=
87+
](nd_item<3>) [[sycl::reqd_work_group_size(4, 4, 4)]]{});
9288
});
9389
Q.wait_and_throw();
9490
} catch (nd_range_error &E) {

SYCL/Basic/queue/release.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@ int main() {
1313
return 0;
1414
}
1515

16-
//CHECK: ---> piEnqueueKernelLaunch(
17-
//CHECK: ---> piQueueRelease(
18-
//CHECK: ---> piEventRelease(
19-
//CHECK: ---> piContextRelease(
20-
//CHECK: ---> piKernelRelease(
21-
//CHECK: ---> piProgramRelease(
22-
//CHECK: ---> piDeviceRelease(
16+
// CHECK: ---> piEnqueueKernelLaunch(
17+
// FIXME the order of these 2 varies between plugins due to a Level Zero
18+
// specific queue workaround.
19+
// CHECK-DAG: ---> piEventRelease(
20+
// CHECK-DAG: ---> piQueueRelease(
21+
// CHECK: ---> piContextRelease(
22+
// CHECK: ---> piKernelRelease(
23+
// CHECK: ---> piProgramRelease(
24+
// CHECK: ---> piDeviceRelease(

SYCL/Basic/subdevice_pi.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ static bool check_separate(device dev, buffer<int, 1> buf,
6161
// CHECK-SEPARATE: ---> piQueueCreate
6262
// CHECK-SEPARATE: ---> piMemBufferCreate
6363
// CHECK-SEPARATE: ---> piEnqueueKernelLaunch
64-
// CHECK-SEPARATE: ---> piEventsWait
64+
// CHECK-SEPARATE: ---> piQueueFinish
6565

6666
log_pi("Test sub device 1");
6767
{
@@ -78,7 +78,7 @@ static bool check_separate(device dev, buffer<int, 1> buf,
7878
// CHECK-SEPARATE: ---> piEnqueueMemBufferWrite
7979
//
8080
// CHECK-SEPARATE: ---> piEnqueueKernelLaunch
81-
// CHECK-SEPARATE: ---> piEventsWait
81+
// CHECK-SEPARATE: ---> piQueueFinish
8282

8383
return true;
8484
}
@@ -113,7 +113,7 @@ static bool check_shared_context(device dev, buffer<int, 1> buf,
113113
// see --implicit-check-not above.
114114
//
115115
// CHECK-SHARED: ---> piEnqueueKernelLaunch
116-
// CHECK-SHARED: ---> piEventsWait
116+
// CHECK-SHARED: ---> piQueueFinish
117117

118118
log_pi("Test sub device 1");
119119
{
@@ -123,7 +123,7 @@ static bool check_shared_context(device dev, buffer<int, 1> buf,
123123
// CHECK-SHARED: Test sub device 1
124124
// CHECK-SHARED: ---> piQueueCreate
125125
// CHECK-SHARED: ---> piEnqueueKernelLaunch
126-
// CHECK-SHARED: ---> piEventsWait
126+
// CHECK-SHARED: ---> piQueueFinish
127127
// CHECK-SHARED: ---> piEnqueueMemBufferRead
128128

129129
return true;
@@ -162,7 +162,7 @@ static bool check_fused_context(device dev, buffer<int, 1> buf,
162162
// *and* the root device): see --implicit-check-not above.
163163
//
164164
// CHECK-FUSED: ---> piEnqueueKernelLaunch
165-
// CHECK-FUSED: ---> piEventsWait
165+
// CHECK-FUSED: ---> piQueueFinish
166166

167167
log_pi("Test sub device 0");
168168
{
@@ -172,7 +172,7 @@ static bool check_fused_context(device dev, buffer<int, 1> buf,
172172
// CHECK-FUSED: Test sub device 0
173173
// CHECK-FUSED: ---> piQueueCreate
174174
// CHECK-FUSED: ---> piEnqueueKernelLaunch
175-
// CHECK-FUSED: ---> piEventsWait
175+
// CHECK-FUSED: ---> piQueueFinish
176176

177177
log_pi("Test sub device 1");
178178
{
@@ -182,7 +182,7 @@ static bool check_fused_context(device dev, buffer<int, 1> buf,
182182
// CHECK-FUSED: Test sub device 1
183183
// CHECK-FUSED: ---> piQueueCreate
184184
// CHECK-FUSED: ---> piEnqueueKernelLaunch
185-
// CHECK-FUSED: ---> piEventsWait
185+
// CHECK-FUSED: ---> piQueueFinish
186186
// CHECK-FUSED: ---> piEnqueueMemBufferRead
187187

188188
return true;

SYCL/Config/kernel_from_file.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
// UNSUPPORTED: cuda
22
// CUDA does not support SPIR-V.
33

4-
// RUN: %clangxx %cxx_std_optionc++17 -fsycl-device-only -fno-sycl-use-bitcode -Xclang -fsycl-int-header=%t.h -c %s -o %t.spv -Xclang -verify-ignore-unexpected=note,warning -Wno-sycl-strict
5-
// RUN: %clangxx %cxx_std_optionc++17 %include_option %t.h %s -o %t.out %sycl_options -Xclang -verify-ignore-unexpected=note,warning
4+
// FIXME Disabled fallback assert as it'll require either online linking or
5+
// explicit offline linking step here
6+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT %cxx_std_optionc++17 -fsycl-device-only -fno-sycl-use-bitcode -Xclang -fsycl-int-header=%t.h -c %s -o %t.spv -Xclang -verify-ignore-unexpected=note,warning -Wno-sycl-strict
7+
// RUN: %clangxx -DSYCL_DISABLE_FALLBACK_ASSERT %cxx_std_optionc++17 %include_option %t.h %s -o %t.out %sycl_options -Xclang -verify-ignore-unexpected=note,warning
68
// RUN: %BE_RUN_PLACEHOLDER env SYCL_USE_KERNEL_SPV=%t.spv %t.out | FileCheck %s
79
// CHECK: Passed
810

SYCL/Config/program_link.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
using namespace cl::sycl;
1313
class DUMMY {
1414
public:
15-
void operator()(item<1>){};
15+
void operator()(item<1>) const {};
1616
};
1717

1818
int main(void) {
@@ -22,6 +22,7 @@ int main(void) {
2222
return 0;
2323
}
2424
context c(p);
25+
queue Q(c, s);
2526
program prog1(c);
2627
prog1.compile_with_kernel_type<DUMMY>();
2728
prog1.link("-cl-finite-math-only");
@@ -40,5 +41,11 @@ int main(void) {
4041
// CHECK-IS-OPT-DISABLE-NOT: -cl-mad-enable
4142
assert(prog2.get_compile_options() == "-cl-mad-enable" &&
4243
"program::get_compile_options() output is wrong");
44+
45+
// enforce SYCL toolchain to emit device image but no enqueue in run-time
46+
if (false) {
47+
Q.submit([&](handler &CGH) { CGH.parallel_for(range<1>{2}, DUMMY{}); });
48+
}
49+
4350
return 0;
44-
}
51+
}

SYCL/DeviceCodeSplit/aot-gpu.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
// UNSUPPORTED: cuda
33
// CUDA does neither support device code splitting nor SPIR.
44
//
5+
// The test is failing with GPU RT 30.0.100.9667
6+
// XFAIL: windows
7+
//
58
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_source \
69
// RUN: -fsycl-targets=spir64_gen-unknown-unknown-sycldevice \
710
// RUN: -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice \

SYCL/DeviceLib/assert-aot.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
// REQUIRES: opencl-aot, cpu, linux
1+
// REQUIRES: opencl-aot, cpu, linux, UNSUPPORTED
2+
// FIXME re-enable after intel/llvm#3767 is merged
23

34
// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice %S/assert.cpp -o %t.aot.out
4-
// RUN: %CPU_RUN_PLACEHOLDER %t.aot.out >%t.aot.msg
5+
// RUN: %CPU_RUN_PLACEHOLDER EXPECTED_SIGNAL=SIGABRT SHOULD_CRASH=1 %t.aot.out 2>%t.aot.msg
56
// RUN: FileCheck %S/assert.cpp --input-file %t.aot.msg --check-prefixes=CHECK-MESSAGE

SYCL/DeviceLib/built-ins/nan.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out
66
// RUN: %ACC_RUN_PLACEHOLDER %t.out
77

8+
// XFAIL: cuda
9+
810
#include <CL/sycl.hpp>
911

1012
#include <cassert>

SYCL/ESIMD/BitonicSortK.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,8 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
167167
simd<uint32_t, BASE_SZ> B;
168168
#pragma unroll
169169
for (int i = 0; i < BASE_SZ; i += 32) {
170-
auto MA = A.select<32, 1>(i).format<uint32_t, 4, 8>();
171-
auto MB = B.select<32, 1>(i).format<uint32_t, 4, 8>();
170+
auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
171+
auto MB = B.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
172172
MB.select<4, 1, 4, 1>(0, 0) = MA.select<4, 1, 4, 1>(0, 4);
173173
MB.select<4, 1, 4, 1>(0, 4) = MA.select<4, 1, 4, 1>(0, 0);
174174
B.select<32, 1>(i).merge(A.select<32, 1>(i),
@@ -196,8 +196,8 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
196196
simd<uint32_t, BASE_SZ> B;
197197
#pragma unroll
198198
for (int i = 0; i < BASE_SZ; i += 32) {
199-
auto MB = B.select<32, 1>(i).format<long long, 4, 4>();
200-
auto MA = A.select<32, 1>(i).format<long long, 4, 4>();
199+
auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
200+
auto MA = A.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
201201
MB.select<4, 1, 2, 2>(0, 0) = MA.select<4, 1, 2, 2>(0, 1);
202202
MB.select<4, 1, 2, 2>(0, 1) = MA.select<4, 1, 2, 2>(0, 0);
203203
B.select<32, 1>(i).merge(A.select<32, 1>(i),
@@ -326,8 +326,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
326326
simd<ushort, 32> flip16(init_mask16);
327327
#pragma unroll
328328
for (int i = 0; i < BASE_SZ; i += 32) {
329-
auto MA = A.select<32, 1>(i).format<uint32_t, 4, 8>();
330-
auto MB = B.select<32, 1>(i).format<uint32_t, 4, 8>();
329+
auto MA = A.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
330+
auto MB = B.select<32, 1>(i).bit_cast_view<uint32_t, 4, 8>();
331331
MA.select<4, 1, 4, 1>(0, 0) = MB.select<4, 1, 4, 1>(0, 4);
332332
MA.select<4, 1, 4, 1>(0, 4) = MB.select<4, 1, 4, 1>(0, 0);
333333
bool dir_up = (((offset + i) >> (m + 1)) & 1) == 0;
@@ -346,8 +346,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
346346
simd<ushort, 32> flip18(init_mask18);
347347
#pragma unroll
348348
for (int i = 0; i < BASE_SZ; i += 32) {
349-
auto MB = B.select<32, 1>(i).format<long long, 4, 4>();
350-
auto MA = A.select<32, 1>(i).format<long long, 4, 4>();
349+
auto MB = B.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
350+
auto MA = A.select<32, 1>(i).bit_cast_view<long long, 4, 4>();
351351

352352
MB.select<4, 1, 2, 2>(0, 0) = MA.select<4, 1, 2, 2>(0, 1);
353353
MB.select<4, 1, 2, 2>(0, 1) = MA.select<4, 1, 2, 2>(0, 0);

0 commit comments

Comments
 (0)