Skip to content

Commit 984057e

Browse files
committed
test to follow up on recent fix intel/llvm#7589. Test performs a reduction using sycl::vec<float>. Is sensitive to byte size vs. vector size fix in SYCL headers.
1 parent adbf7ab commit 984057e

File tree

1 file changed

+109
-0
lines changed

1 file changed

+109
-0
lines changed
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
5+
6+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -O0 %s -o %t_O0.out
7+
// RUN: %CPU_RUN_PLACEHOLDER %t_O0.out
8+
// RUN: %GPU_RUN_PLACEHOLDER %t_O0.out
9+
// RUN: %ACC_RUN_PLACEHOLDER %t_O0.out
10+
11+
/*
  Test performs a lattice reduction.
  sycl::vec<float> is sensitive to .get_size() vs .size() in SYCL headers
  (i.e., byte size versus vector size).
*/
16+
17+
#include <sycl/sycl.hpp>

#include <cassert>
#include <cmath>
#include <cstdio>
#include <iostream>
18+
19+
using namespace sycl;
20+
21+
// Problem dimensions shared by the kernel and the host-side verification.
// Typed constants instead of object-like macros: same names and values, but
// scoped and type-checked.
constexpr int NX = 32; // work-items per work-group (elements reduced per group)
constexpr int NZ = 2;  // number of work-groups, i.e. number of result vectors
constexpr int NV = 8;  // lanes per sycl::vec<float, NV>
24+
25+
template <typename T>
26+
void groupSum(T *r, const T &in, const int k, sycl::group<2> &grp,
27+
const int i) {
28+
29+
T tin = (i == k ? in : T(0));
30+
auto out = reduce_over_group(grp, tin, sycl::plus<>());
31+
if (i == k && grp.get_local_id()[1] == 0)
32+
r[k] = out;
33+
}
34+
35+
// Launches the vector reduction kernel and waits for it to finish.
//
// q: queue the kernel is submitted to.
// r: output, viewed as NZ vectors of NV floats (one vector per work-group).
// x: input, viewed as NZ * NX vectors of NV floats.
// n: not read by this function.
void test(queue q, float *r, float *x, int n) {
  using vecn = sycl::vec<float, NV>; // NV floats per element

  // View the flat float buffers as arrays of vecn; the pointers are captured
  // by value in the kernel below.
  vecn *const vecIn = reinterpret_cast<vecn *>(x);
  vecn *const vecOut = reinterpret_cast<vecn *>(r);

  // NZ x NX work-items split into 1 x NX work-groups, i.e. NZ work-groups of
  // NX work-items each.
  sycl::range<2> global(NZ, NX);
  sycl::range<2> local(1, NX);
  sycl::nd_range<2> launch{global, local};

  q.submit([&](sycl::handler &cgh) {
    cgh.parallel_for<>(launch, [=](sycl::nd_item<2> item) {
      int col = item.get_global_id(1);
      int row = item.get_global_id(0);

      auto workGroup = item.get_group();

      // Every work-item runs the reduction once per iz; groupSum only lets
      // the pass with iz == row contribute and store a result.
      for (int iz = 0; iz < NZ; iz++) {
        groupSum(vecOut, vecIn[row * NX + col], row, workGroup, iz);
      }
    });
  });
  q.wait();
}
60+
61+
int main() {
62+
63+
queue q{default_selector_v};
64+
auto dev = q.get_device();
65+
std::cout << "Device: " << dev.get_info<info::device::name>() << std::endl;
66+
67+
auto ctx = q.get_context();
68+
int n = NX * NZ * NV; // 16 * 8 * 2 => 256
69+
auto *x = (float *)sycl::malloc_shared(n * sizeof(float), dev,
70+
ctx); // 256 * sizeof(float)
71+
auto *r = (float *)sycl::malloc_shared(
72+
NZ * NV * sizeof(float), dev, ctx); // 2 * 8 => 16 ( * sizeof(float) )
73+
74+
for (int i = 0; i < n; i++) {
75+
x[i] = i;
76+
}
77+
78+
q.wait();
79+
80+
test(q, r, x, n);
81+
82+
int fails = 0;
83+
for (int k = 0; k < NZ; k++) {
84+
float s[NV] = {0};
85+
for (int i = 0; i < NX; i++) {
86+
for (int j = 0; j < NV; j++) {
87+
s[j] += x[(k * NX + i) * NV + j];
88+
}
89+
}
90+
for (int j = 0; j < NV; j++) {
91+
auto d = s[j] - r[k * NV + j];
92+
if (abs(d) > 1e-10) {
93+
printf("partial fail ");
94+
printf("%i\t%i\t%g\t%g\n", k, j, s[j], r[k * NV + j]);
95+
fails++;
96+
} else {
97+
printf("partial pass ");
98+
printf("%i\t%i\t%g\t%g\n", k, j, s[j], r[k * NV + j]);
99+
}
100+
}
101+
}
102+
103+
if (fails == 0) {
104+
printf("test passed!\n");
105+
} else {
106+
printf("test failed!\n");
107+
}
108+
assert(fails == 0);
109+
}

0 commit comments

Comments
 (0)