10
10
// RUN: %clangxx -fsycl %s -o %t.out
11
11
// RUN: %HOST_RUN_PLACEHOLDER %t.out
12
12
// RUN: %GPU_RUN_PLACEHOLDER %t.out
13
- // Temporary disabled on Windows
14
- // UNSUPPORTED: windows
15
13
16
14
#include " esimd_test_utils.hpp"
17
15
24
22
// test 8x16 block size
25
23
//
26
24
#define DIM_SIZE (1 << 13 )
27
- #define SQUARE_SZ (DIM_SIZE * DIM_SIZE + 16 )
25
+ #define SQUARE_SZ (DIM_SIZE * DIM_SIZE)
28
26
29
27
#define WIDTH 16
30
28
#define HEIGHT 16
@@ -101,15 +99,18 @@ int main(void) {
101
99
auto ctxt = q.get_context ();
102
100
103
101
// create and init matrices
104
- float *inputMatrix =
105
- static_cast <float *>(malloc_shared (SQUARE_SZ * sizeof (float ), dev, ctxt));
106
- float *outputMatrix =
107
- static_cast <float *>(malloc_shared (SQUARE_SZ * sizeof (float ), dev, ctxt));
102
+ float *inputMatrix = new float [SQUARE_SZ];
103
+ float *outputMatrix = new float [SQUARE_SZ];
108
104
InitializeSquareMatrix (inputMatrix, DIM_SIZE, false );
109
105
InitializeSquareMatrix (outputMatrix, DIM_SIZE, true );
110
106
111
107
try {
108
+ buffer<float , 1 > buf_in (inputMatrix, range<1 >(SQUARE_SZ));
109
+ buffer<float , 1 > buf_out (outputMatrix, range<1 >(SQUARE_SZ));
110
+
112
111
auto e = q.submit ([&](handler &cgh) {
112
+ auto input = buf_in.get_access <access::mode::read>(cgh);
113
+ auto output = buf_out.get_access <access::mode::write>(cgh);
113
114
cgh.parallel_for <class Stencil_kernel >(
114
115
GlobalRange * LocalRange, [=](item<2 > it) SYCL_ESIMD_KERNEL {
115
116
using namespace sycl ::INTEL::gpu;
@@ -125,23 +126,23 @@ int main(void) {
125
126
// the code will interleave data loading and compute
126
127
// first, we load enough data for the first 16 pixels
127
128
//
128
- unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
129
+ unsigned off =
130
+ ((v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH) * sizeof (float );
129
131
#pragma unroll
130
132
for (unsigned i = 0 ; i < 10 ; i++) {
131
- in.row (i) = block_load<float , 32 >(inputMatrix + off);
132
- off += DIM_SIZE;
133
+ in.row (i) = block_load<float , 32 >(input, off);
134
+ off += DIM_SIZE * sizeof ( float ) ;
133
135
}
134
136
135
137
unsigned out_off =
136
- (((v_pos * HEIGHT + 5 ) * DIM_SIZE + (h_pos * WIDTH) + 5 )) *
137
- sizeof (float );
138
+ ((v_pos * HEIGHT + 5 ) * DIM_SIZE + (h_pos * WIDTH) + 5 );
138
139
simd<unsigned , WIDTH> elm16 (0 , 1 );
139
140
140
141
#pragma unroll
141
142
for (unsigned i = 0 ; i < HEIGHT; i++) {
142
143
143
- in.row (10 + i) = block_load<float , 32 >(inputMatrix + off);
144
- off += DIM_SIZE;
144
+ in.row (10 + i) = block_load<float , 32 >(input, off);
145
+ off += DIM_SIZE * sizeof ( float ) ;
145
146
146
147
simd<float , WIDTH> sum =
147
148
vin.select <WIDTH, 1 >(GET_IDX (i, 5 )) * -0 .02f +
@@ -168,9 +169,9 @@ int main(void) {
168
169
// predicate output
169
170
simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10 ;
170
171
171
- simd<unsigned , WIDTH> elm16_off = elm16 * sizeof ( float ) + out_off;
172
- scatter<float , WIDTH>(outputMatrix , sum, elm16_off, p);
173
- out_off += DIM_SIZE * sizeof ( float ) ;
172
+ simd<unsigned , WIDTH> elm16_off = elm16 + out_off;
173
+ scatter<float , WIDTH>(output , sum, elm16_off, 0 , p);
174
+ out_off += DIM_SIZE;
174
175
175
176
if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1 )
176
177
break ;
@@ -180,8 +181,8 @@ int main(void) {
180
181
e.wait ();
181
182
} catch (cl::sycl::exception const &e) {
182
183
std::cout << " SYCL exception caught: " << e.what () << ' \n ' ;
183
- free (inputMatrix, ctxt) ;
184
- free (outputMatrix, ctxt) ;
184
+ delete[] inputMatrix ;
185
+ delete[] outputMatrix ;
185
186
return e.get_cl_code ();
186
187
}
187
188
@@ -192,7 +193,7 @@ int main(void) {
192
193
} else {
193
194
std::cout << " FAILED" << std::endl;
194
195
}
195
- free (inputMatrix, ctxt) ;
196
- free (outputMatrix, ctxt) ;
196
+ delete[] inputMatrix ;
197
+ delete[] outputMatrix ;
197
198
return 0 ;
198
199
}
0 commit comments