This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit f2bec11

Merge remote-tracking branch 'upstream/intel' into filter

2 parents: 6ec4410 + d2abf74

38 files changed: 1,731 additions, 152 deletions

.github/CODEOWNERS (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
-* @vladimirlaz
+* @vladimirlaz @romanovvlad @bader
 
 # AOT compilation
 SYCL/AOT @AGindinson @dm-vodopyanov @AlexeySachkov @romanovvlad

SYCL/AOT/gpu.cpp (0 additions, 3 deletions)

@@ -10,8 +10,5 @@
 // UNSUPPORTED: cuda
 // CUDA is not compatible with SPIR.
 //
-// The test is failing with GPU RT 30.0.100.9667
-// XFAIL: windows
-//
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device *" %S/Inputs/aot.cpp -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
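Note on the deleted directives (the same deletion recurs in several files below): in LLVM LIT test headers, XFAIL: <feature> still runs the test on matching configurations but inverts the expectation, so a pass is reported as an unexpected pass (XPASS), while UNSUPPORTED: <feature> skips the test entirely. Deleting an XFAIL: windows line therefore restores normal pass/fail reporting on Windows once the underlying driver bug is fixed. A hypothetical header (an illustration, not a file from this commit) showing how the directives combine:

// A hypothetical LIT test header; %clangxx and %GPU_RUN_PLACEHOLDER are
// substitutions that LIT expands when the test runs.
// UNSUPPORTED: cuda
// XFAIL: windows
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED skips the test outright on 'cuda' configurations; XFAIL runs
// it on 'windows' but treats failure as the expected outcome.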

SYCL/AOT/multiple-devices.cpp (0 additions, 3 deletions)

@@ -10,9 +10,6 @@
 // UNSUPPORTED: cuda
 // CUDA is not compatible with SPIR.
 
-// The test is failing with GPU RT 30.0.100.9667
-// XFAIL: windows
-
 // 1-command compilation case
 // Targeting CPU, GPU, FPGA
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice,spir64_gen-unknown-unknown-sycldevice,spir64_fpga-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device *" %S/Inputs/aot.cpp -o %t_all.out

SYCL/Basic/multisource.cpp (0 additions, 2 deletions)

@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: cuda
-
 // Separate kernel sources and host code sources
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -c -o %t.kernel.o %s -DINIT_KERNEL -DCALC_KERNEL
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -c -o %t.main.o %s -DMAIN_APP

SYCL/Basic/reqd_work_group_size.cpp (0 additions, 4 deletions)

@@ -1,7 +1,3 @@
-// XFAIL: cuda
-// The negative test fails on CUDA. It's not clear whether the CUDA backend
-// respects the reqd_work_group_size attribute.
-
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
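For context, the "negative test" the deleted comment referred to checks that launching a kernel with a local size that contradicts its required work-group size is rejected by the runtime. A minimal sketch of that pattern follows; the attribute spelling and the exception type are assumptions based on DPC++ conventions of this period, not text from the file:

#include <CL/sycl.hpp>
#include <iostream>

int main() {
  cl::sycl::queue Q;
  try {
    Q.submit([&](cl::sycl::handler &Cgh) {
      // The kernel requires a work-group size of 4 but is launched with 8.
      Cgh.parallel_for<class ReqdWGNeg>(
          cl::sycl::nd_range<1>{cl::sycl::range<1>{16}, cl::sycl::range<1>{8}},
          [=](cl::sycl::nd_item<1>) [[cl::reqd_work_group_size(1, 1, 4)]] {});
    });
    Q.wait_and_throw();
    std::cout << "No exception: the backend may ignore the attribute\n";
  } catch (cl::sycl::nd_range_error const &E) {
    // Expected: the launch configuration contradicts the required size.
    std::cout << "Expected mismatch error: " << E.what() << '\n';
  }
  return 0;
}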

SYCL/Basic/subsubdevice.cpp (new file, 171 additions, 0 deletions)

@@ -0,0 +1,171 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==------------ subdevice.cpp - SYCL subdevice basic test -----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <utility>
+
+using namespace cl::sycl;
+
+int main() {
+  try {
+    auto devices = device::get_devices();
+    for (const auto &dev : devices) {
+      // TODO: implement subdevices creation for host device
+      if (dev.is_host())
+        continue;
+
+      assert(dev.get_info<info::device::partition_type_property>() ==
+             info::partition_property::no_partition);
+
+      size_t MaxSubDevices =
+          dev.get_info<info::device::partition_max_sub_devices>();
+
+      if (MaxSubDevices == 0)
+        continue;
+
+      try {
+        auto SubDevicesEq =
+            dev.create_sub_devices<info::partition_property::partition_equally>(
+                1);
+        assert(SubDevicesEq.size() == MaxSubDevices &&
+               "Requested 1 compute unit in each subdevice, expected maximum "
+               "number of subdevices in output");
+        std::cout << "Created " << SubDevicesEq.size()
+                  << " subdevices using equal partition scheme" << std::endl;
+
+        assert(
+            SubDevicesEq[0].get_info<info::device::partition_type_property>() ==
+            info::partition_property::partition_equally);
+
+        assert(SubDevicesEq[0].get_info<info::device::parent_device>().get() ==
+               dev.get());
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        vector_class<size_t> Counts(MaxSubDevices, 1);
+        auto SubDevicesByCount = dev.create_sub_devices<
+            info::partition_property::partition_by_counts>(Counts);
+        assert(SubDevicesByCount.size() == MaxSubDevices &&
+               "Maximum number of subdevices was requested with 1 compute unit "
+               "on each");
+        std::cout << "Created " << SubDevicesByCount.size()
+                  << " subdevices using partition by counts scheme."
+                  << std::endl;
+        assert(SubDevicesByCount[0]
+                   .get_info<info::device::partition_type_property>() ==
+               info::partition_property::partition_by_counts);
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainNuma = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::numa);
+        std::cout
+            << "Created " << SubDevicesDomainNuma.size()
+            << " subdevices using partition by numa affinity domain scheme."
+            << std::endl;
+
+        auto SubSubDevicesDomainNuma =
+            SubDevicesDomainNuma[0]
+                .create_sub_devices<
+                    info::partition_property::partition_by_affinity_domain>(
+                    info::partition_affinity_domain::numa);
+
+        std::cout << "Created " << SubSubDevicesDomainNuma.size()
+                  << " sub-subdevices from subdevice 0 using partition by numa "
+                     "affinity domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainL4 = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::L4_cache);
+        std::cout << "Created " << SubDevicesDomainL4.size()
+                  << " subdevices using partition by L4 cache domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainL3 = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::L3_cache);
+        std::cout << "Created " << SubDevicesDomainL3.size()
+                  << " subdevices using partition by L3 cache domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainL2 = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::L2_cache);
+        std::cout << "Created " << SubDevicesDomainL2.size()
+                  << " subdevices using partition by L2 cache domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainL1 = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::L1_cache);
+        std::cout << "Created " << SubDevicesDomainL1.size()
+                  << " subdevices using partition by L1 cache domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainNextPart = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::next_partitionable);
+        std::cout << "Created " << SubDevicesDomainNextPart.size()
+                  << " subdevices using partition by next partitionable "
+                     "domain scheme."
+                  << std::endl;
+
+        auto SubSubDevicesDomainNextPart =
+            SubDevicesDomainNextPart[0]
+                .create_sub_devices<
+                    info::partition_property::partition_by_affinity_domain>(
+                    info::partition_affinity_domain::next_partitionable);
+        std::cout << "Created " << SubSubDevicesDomainNextPart.size()
+                  << " sub-subdevices from subdevice 0 using partition by next "
+                     "partitionable domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+    }
+  } catch (exception e) {
+    std::cout << "SYCL exception caught: " << e.what() << std::endl;
+    return 1;
+  }
+  return 0;
+}
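The new test only enumerates partitions and re-partitions (sub-sub-devices); the typical next step, not shown in the test, is to bind work to one of them. A minimal sketch under the same SYCL 1.2.1-era API, with everything beyond create_sub_devices assumed:

#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  device Root{default_selector{}};
  try {
    auto Parts = Root.create_sub_devices<
        info::partition_property::partition_by_affinity_domain>(
        info::partition_affinity_domain::next_partitionable);
    // A queue built on a sub-device confines submitted work to that partition.
    queue Q{Parts[0]};
    Q.submit([](handler &Cgh) { Cgh.single_task<class Touch>([] {}); }).wait();
  } catch (feature_not_supported const &) {
    // The device cannot be partitioned this way; nothing to demonstrate.
  }
  return 0;
}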

SYCL/DeviceCodeSplit/aot-gpu.cpp (0 additions, 3 deletions)

@@ -2,9 +2,6 @@
 // UNSUPPORTED: cuda
 // CUDA does neither support device code splitting nor SPIR.
 //
-// The test is failing with GPU RT 30.0.100.9667
-// XFAIL: windows
-//
 // RUN: %clangxx -fsycl -fsycl-device-code-split=per_source \
 // RUN:   -fsycl-targets=spir64_gen-unknown-unknown-sycldevice \
 // RUN:   -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice \

SYCL/ESIMD/PrefixSum.cpp (6 additions, 4 deletions)

@@ -184,7 +184,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
 
   simd<ushort, 32> p = elm32 < remaining;
 
-  S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
+  S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
+                                                         p);
 
   auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
   cnt_table.column(0) += prev;
@@ -214,7 +215,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
     cnt_table.select<1, 1, 16, 1>(j, 16) +=
         cnt_table.replicate<1, 0, 16, 0>(j, 15);
   }
-  scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset, p);
+  scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset,
+                                                      p);
   elm32 += 32;
   element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
   prev = cnt_table.column(31);
@@ -252,7 +254,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
   unsigned n_iter = n_entries / 32;
   for (unsigned i = 0; i < n_iter; i++) {
 
-    S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
+    S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
 
     auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
     cnt_table.column(0) += prev;
@@ -288,7 +290,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
     if (i == n_iter - 1)
      cnt_table.column(31) -= cnt_table.column(30);
 
-    scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset);
+    scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset);
 
    element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
    prev = cnt_table.column(31);
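This diff and the two Prefix_Local_sum diffs below are one mechanical rename: the experimental ESIMD intrinsics gather4/scatter4 became gather_rgba/scatter_rgba (the "4" stood for the up-to-four RGBA channels read or written per address). The call shape is unchanged, as the fragment below, distilled from the updated tests, shows. It assumes the tests' own surrounding definitions (simd, ushort, TUPLE_SZ, GATHER_SCATTER_MASK, elm32, remaining) are in scope:

// Inside an ESIMD kernel, per the updated tests:
simd<unsigned int, 32 * TUPLE_SZ> S;    // 32 lanes x TUPLE_SZ channels
simd<ushort, 32> p = elm32 < remaining; // per-lane predicate
// Predicated channel-wise gather; lanes with p == 0 are not accessed.
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
// ... update S ...
// The scatter mirrors the gather, writing back only the active lanes.
scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset, p);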

SYCL/ESIMD/Prefix_Local_sum2.cpp (2 additions, 2 deletions)

@@ -73,13 +73,13 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
 
   simd<unsigned int, 32 * TUPLE_SZ> S, T;
 
-  S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
+  S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
 
 #pragma unroll
   for (int i = 1; i < PREFIX_ENTRIES / 32; i++) {
     element_offset += (stride_elems * 32 * TUPLE_SZ) * sizeof(unsigned);
     // scattered read, each inst reads 16 entries
-    T = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
+    T = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
     S += T;
   }
 

SYCL/ESIMD/Prefix_Local_sum3.cpp (4 additions, 2 deletions)

@@ -197,7 +197,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
 
   simd<ushort, 32> p = elm32 < remaining;
 
-  S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
+  S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
+                                                         p);
 
   auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
   cnt_table.column(0) += prev;
@@ -226,7 +227,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
     cnt_table.select<1, 1, 16, 1>(j, 16) +=
         cnt_table.replicate<1, 0, 16, 0>(j, 15);
   }
-  scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset, p);
+  scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset,
+                                                      p);
   elm32 += 32;
   element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
   prev = cnt_table.column(31);

SYCL/ESIMD/accessor_gather_scatter.cpp (1 addition, 0 deletions)

@@ -67,6 +67,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
       Kernel<T, VL, STRIDE> kernel(acc);
       cgh.parallel_for(glob_range, kernel);
    });
+    e.wait();
  } catch (cl::sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    delete[] A;
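The single added line is most likely a synchronization fix: queue::submit returns immediately, so waiting on the returned event makes the kernel's completion (and any resulting error) observable while execution is still inside the try block, where the existing catch clause can run its cleanup (delete[] A). A self-contained sketch of the pattern, with all names assumed rather than taken from the test:

#include <CL/sycl.hpp>
#include <iostream>

bool run(cl::sycl::queue &q) {
  int *A = new int[16]();
  try {
    cl::sycl::buffer<int, 1> buf(A, cl::sycl::range<1>(16));
    cl::sycl::event e = q.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class Inc>(cl::sycl::range<1>(16),
                                  [=](cl::sycl::id<1> i) { acc[i] += 1; });
    });
    e.wait(); // kernel completion observed while still inside the try block
  } catch (cl::sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    delete[] A; // the cleanup path the test relies on
    return false;
  }
  delete[] A;
  return true;
}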

SYCL/ESIMD/ext_math.cpp (31 additions, 13 deletions)

@@ -9,13 +9,16 @@
 // UNSUPPORTED: cuda
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// Enable when driver fixes will be propagated into the official release
+// XFAIL: windows
 
 // This test checks extended math operations.
 
 #include "esimd_test_utils.hpp"
 
 #include <CL/sycl.hpp>
 #include <CL/sycl/INTEL/esimd.hpp>
+#include <CL/sycl/builtins_esimd.hpp>
 #include <iostream>
 
 using namespace cl::sycl;
@@ -35,7 +38,16 @@ struct InitDataFuncWide {
 struct InitDataFuncNarrow {
   void operator()(float *In, float *Out, size_t Size) const {
     for (auto I = 0; I < Size; ++I) {
-      In[I] = 2.0f + 16.0f * ((float)I / (float)(Size - 1)); // in [2..16] range
+      In[I] = 2.0f + 16.0f * ((float)I / (float)(Size - 1)); // in [2..18] range
+      Out[I] = (float)0.0;
+    }
+  }
+};
+
+struct InitDataInRange0_5 {
+  void operator()(float *In, float *Out, size_t Size) const {
+    for (auto I = 0; I < Size; ++I) {
+      In[I] = 5.0f * ((float)I / (float)(Size - 1)); // in [0..5] range
       Out[I] = (float)0.0;
     }
   }
@@ -52,7 +64,7 @@ template <MathOp Op> float HostMathFunc(float X);
 
 // --- Specializations per each extended math operation
 
-#define DEFINE_OP(Op, HostOp)                                                  \
+#define DEFINE_ESIMD_OP(Op, HostOp)                                            \
   template <> float HostMathFunc<MathOp::Op>(float X) { return HostOp(X); }    \
   template <int VL> struct DeviceMathFunc<VL, MathOp::Op> {                    \
     simd<float, VL>                                                            \
@@ -61,13 +73,22 @@ template <MathOp Op> float HostMathFunc(float X);
     }                                                                          \
   }
 
-DEFINE_OP(sin, sin);
-DEFINE_OP(cos, cos);
-DEFINE_OP(exp, exp);
-DEFINE_OP(log, log);
-DEFINE_OP(inv, 1.0f /);
-DEFINE_OP(sqrt, sqrt);
-DEFINE_OP(rsqrt, 1.0f / sqrt);
+#define DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(Op, HostOp)                         \
+  template <> float HostMathFunc<MathOp::Op>(float X) { return HostOp(X); }    \
+  template <int VL> struct DeviceMathFunc<VL, MathOp::Op> {                    \
+    simd<float, VL>                                                            \
+    operator()(const simd<float, VL> &X) const SYCL_ESIMD_FUNCTION {           \
+      return sycl::Op<VL>(X);                                                  \
+    }                                                                          \
+  }
+
+DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(sin, sin);
+DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(cos, cos);
+DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(exp, exp);
+DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(log, log);
+DEFINE_ESIMD_OP(inv, 1.0f /);
+DEFINE_ESIMD_OP(sqrt, sqrt);
+DEFINE_ESIMD_OP(rsqrt, 1.0f / sqrt);
 
 // --- Generic kernel calculating an extended math operation on array elements
 
@@ -159,13 +180,10 @@ template <int VL> bool test(queue &Q) {
   Pass &= test<MathOp::sqrt, VL>(Q, "sqrt", InitDataFuncWide{});
   Pass &= test<MathOp::inv, VL>(Q, "inv");
   Pass &= test<MathOp::rsqrt, VL>(Q, "rsqrt");
-  // TODO enable these tests after the implementation is fixed
-#if ENABLE_SIN_COS_EXP_LOG
   Pass &= test<MathOp::sin, VL>(Q, "sin", InitDataFuncWide{});
   Pass &= test<MathOp::cos, VL>(Q, "cos", InitDataFuncWide{});
-  Pass &= test<MathOp::exp, VL>(Q, "exp");
+  Pass &= test<MathOp::exp, VL>(Q, "exp", InitDataInRange0_5{});
   Pass &= test<MathOp::log, VL>(Q, "log", InitDataFuncWide{});
-#endif
   return Pass;
 }
 
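The net effect of the macro split: sin, cos, exp and log now route through sycl:: math overloads that accept a simd<float, VL> argument directly (declared in the newly included <CL/sycl/builtins_esimd.hpp>), while inv, sqrt and rsqrt keep the original ESIMD path, and the four re-enabled operations leave the old #if ENABLE_SIN_COS_EXP_LOG guard. Expanding the new macro by hand for sin, using only tokens present in the diff (a reconstruction, not text from the file):

// DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(sin, sin); expands to:
template <> float HostMathFunc<MathOp::sin>(float X) { return sin(X); }
template <int VL> struct DeviceMathFunc<VL, MathOp::sin> {
  simd<float, VL>
  operator()(const simd<float, VL> &X) const SYCL_ESIMD_FUNCTION {
    return sycl::sin<VL>(X); // simd overload from <CL/sycl/builtins_esimd.hpp>
  }
};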
