// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -O0 %s -o %t_O0.out
// RUN: %CPU_RUN_PLACEHOLDER %t_O0.out
// RUN: %GPU_RUN_PLACEHOLDER %t_O0.out
// RUN: %ACC_RUN_PLACEHOLDER %t_O0.out
/*
  The test performs a lattice reduction.
  sycl::vec<float> is sensitive to .get_size() vs .size() in the SYCL headers
  (i.e., byte size versus number of elements).
*/
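
// Added note (not in the original test): per the SYCL 2020 spec, for
// vecn = sycl::vec<float, NV>,
//   vecn{}.size()     -> NV                 (number of elements)
//   vecn{}.get_size() -> NV * sizeof(float) (size in bytes; deprecated in
//                                            favor of byte_size())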
16
+
17
+ #include < sycl/sycl.hpp>
18
+
19
+ using namespace sycl ;
20
+
21
+ #define NX 32
22
+ #define NZ 2
23
+ #define NV 8
24
+
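// groupSum: reduce `in` over the 2D work-group with sycl::plus<>. Work-items
// contribute their value only on the call where i == k (otherwise they feed in
// T(0)), and the work-item with local id 0 in dimension 1 stores the group sum
// to r[k].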
template <typename T>
void groupSum(T *r, const T &in, const int k, sycl::group<2> &grp,
              const int i) {

  T tin = (i == k ? in : T(0));
  auto out = reduce_over_group(grp, tin, sycl::plus<>());
  if (i == k && grp.get_local_id()[1] == 0)
    r[k] = out;
}

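// test: launch a 2D nd_range kernel that views x and r as sycl::vec<float, NV>
// and reduces each z slice of x into one vector entry of r.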
void test(queue q, float *r, float *x,
          int n) { // r is NZ * NV = 16 floats, x is n = 512 floats

  sycl::range<2> globalSize(NZ, NX); // 2,32
  sycl::range<2> localSize(1, NX);   // 1,32: one work-group per z slice
  sycl::nd_range<2> range{globalSize, localSize};

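  // Each work-group covers one z slice; every work-item loads one
  // vec<float, NV> from x, and the slice's group sum lands in vr[k].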
  q.submit([&](sycl::handler &h) {
    h.parallel_for(range, [=](sycl::nd_item<2> ndi) {
      int i = ndi.get_global_id(1);
      int k = ndi.get_global_id(0);

      using vecn = sycl::vec<float, NV>; // 8 floats
      auto vx = reinterpret_cast<vecn *>(x);
      auto vr = reinterpret_cast<vecn *>(r);

      auto myg = ndi.get_group();

      for (int iz = 0; iz < NZ; iz++) { // loop over Z (NZ = 2)
        groupSum(vr, vx[k * NX + i], k, myg, iz);
      }
    });
  });
  q.wait();
}

int main() {

  queue q{default_selector_v};
  auto dev = q.get_device();
  std::cout << "Device: " << dev.get_info<info::device::name>() << std::endl;

  auto ctx = q.get_context();
  int n = NX * NZ * NV; // 32 * 2 * 8 => 512
  auto *x = (float *)sycl::malloc_shared(n * sizeof(float), dev,
                                         ctx); // 512 * sizeof(float)
  auto *r = (float *)sycl::malloc_shared(
      NZ * NV * sizeof(float), dev, ctx); // 2 * 8 => 16 ( * sizeof(float) )

  for (int i = 0; i < n; i++) {
    x[i] = i;
  }

  q.wait();

  test(q, r, x, n);

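  // Host-side reference: recompute the per-slice, per-lane sums and compare
  // them against the device results in r.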
  int fails = 0;
  for (int k = 0; k < NZ; k++) {
    float s[NV] = {0};
    for (int i = 0; i < NX; i++) {
      for (int j = 0; j < NV; j++) {
        s[j] += x[(k * NX + i) * NV + j];
      }
    }
    for (int j = 0; j < NV; j++) {
      auto d = s[j] - r[k * NV + j];
      if (std::fabs(d) > 1e-10) {
        printf("partial fail ");
        printf("%i\t%i\t%g\t%g\n", k, j, s[j], r[k * NV + j]);
        fails++;
      } else {
        printf("partial pass ");
        printf("%i\t%i\t%g\t%g\n", k, j, s[j], r[k * NV + j]);
      }
    }
  }

  if (fails == 0) {
    printf("test passed!\n");
  } else {
    printf("test failed!\n");
  }

  // Release the USM allocations before exiting.
  sycl::free(x, ctx);
  sycl::free(r, ctx);

  assert(fails == 0);
}