Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[ESIMD] Rewrite Stencils tests using buffers #231

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 22 additions & 19 deletions SYCL/ESIMD/Stencil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
// test 8x16 block size
//
#define DIM_SIZE (1 << 13)
#define SQUARE_SZ (DIM_SIZE * DIM_SIZE + 16)
#define SQUARE_SZ (DIM_SIZE * DIM_SIZE)

#define WIDTH 16
#define HEIGHT 16
Expand Down Expand Up @@ -97,15 +97,18 @@ int main(void) {
auto ctxt = q.get_context();

// create and init matrices
float *inputMatrix =
static_cast<float *>(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt));
float *outputMatrix =
static_cast<float *>(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt));
float *inputMatrix = new float[SQUARE_SZ];
float *outputMatrix = new float[SQUARE_SZ];
InitializeSquareMatrix(inputMatrix, DIM_SIZE, false);
InitializeSquareMatrix(outputMatrix, DIM_SIZE, true);

try {
buffer<float, 1> buf_in(inputMatrix, range<1>(SQUARE_SZ));
buffer<float, 1> buf_out(outputMatrix, range<1>(SQUARE_SZ));

auto e = q.submit([&](handler &cgh) {
auto input = buf_in.get_access<access::mode::read>(cgh);
auto output = buf_out.get_access<access::mode::write>(cgh);
cgh.parallel_for<class Stencil_kernel>(
GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
Expand All @@ -121,23 +124,23 @@ int main(void) {
// the code will interleave data loading and compute
// first, we load enough data for the first 16 pixels
//
unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
unsigned off =
((v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH) * sizeof(float);
#pragma unroll
for (unsigned i = 0; i < 10; i++) {
in.row(i) = block_load<float, 32>(inputMatrix + off);
off += DIM_SIZE;
in.row(i) = block_load<float, 32>(input, off);
off += DIM_SIZE * sizeof(float);
}

unsigned out_off =
(((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) *
sizeof(float);
((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5);
simd<unsigned, WIDTH> elm16(0, 1);

#pragma unroll
for (unsigned i = 0; i < HEIGHT; i++) {

in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
off += DIM_SIZE;
in.row(10 + i) = block_load<float, 32>(input, off);
off += DIM_SIZE * sizeof(float);

simd<float, WIDTH> sum =
in.row(i + 0).select<WIDTH, 1>(5) * -0.02f +
Expand All @@ -164,9 +167,9 @@ int main(void) {
// predciate output
simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;

simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);
out_off += DIM_SIZE * sizeof(float);
simd<unsigned, WIDTH> elm16_off = elm16 + out_off;
scatter<float, WIDTH>(output, sum, elm16_off, 0, p);
out_off += DIM_SIZE;

if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1)
break;
Expand All @@ -176,8 +179,8 @@ int main(void) {
e.wait();
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
free(inputMatrix, ctxt);
free(outputMatrix, ctxt);
delete[] inputMatrix;
delete[] outputMatrix;
return e.get_cl_code();
}

Expand All @@ -188,7 +191,7 @@ int main(void) {
} else {
std::cout << "FAILED" << std::endl;
}
free(inputMatrix, ctxt);
free(outputMatrix, ctxt);
delete[] inputMatrix;
delete[] outputMatrix;
return 0;
}
41 changes: 22 additions & 19 deletions SYCL/ESIMD/stencil2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
// test 8x16 block size
//
#define DIM_SIZE (1 << 13)
#define SQUARE_SZ (DIM_SIZE * DIM_SIZE + 16)
#define SQUARE_SZ (DIM_SIZE * DIM_SIZE)

#define WIDTH 16
#define HEIGHT 16
Expand Down Expand Up @@ -99,15 +99,18 @@ int main(void) {
auto ctxt = q.get_context();

// create and init matrices
float *inputMatrix =
static_cast<float *>(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt));
float *outputMatrix =
static_cast<float *>(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt));
float *inputMatrix = new float[SQUARE_SZ];
float *outputMatrix = new float[SQUARE_SZ];
InitializeSquareMatrix(inputMatrix, DIM_SIZE, false);
InitializeSquareMatrix(outputMatrix, DIM_SIZE, true);

try {
buffer<float, 1> buf_in(inputMatrix, range<1>(SQUARE_SZ));
buffer<float, 1> buf_out(outputMatrix, range<1>(SQUARE_SZ));

auto e = q.submit([&](handler &cgh) {
auto input = buf_in.get_access<access::mode::read>(cgh);
auto output = buf_out.get_access<access::mode::write>(cgh);
cgh.parallel_for<class Stencil_kernel>(
GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
Expand All @@ -123,23 +126,23 @@ int main(void) {
// the code will interleave data loading and compute
// first, we load enough data for the first 16 pixels
//
unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
unsigned off =
((v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH) * sizeof(float);
#pragma unroll
for (unsigned i = 0; i < 10; i++) {
in.row(i) = block_load<float, 32>(inputMatrix + off);
off += DIM_SIZE;
in.row(i) = block_load<float, 32>(input, off);
off += DIM_SIZE * sizeof(float);
}

unsigned out_off =
(((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) *
sizeof(float);
((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5);
simd<unsigned, WIDTH> elm16(0, 1);

#pragma unroll
for (unsigned i = 0; i < HEIGHT; i++) {

in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
off += DIM_SIZE;
in.row(10 + i) = block_load<float, 32>(input, off);
off += DIM_SIZE * sizeof(float);

simd<float, WIDTH> sum =
vin.select<WIDTH, 1>(GET_IDX(i, 5)) * -0.02f +
Expand All @@ -166,9 +169,9 @@ int main(void) {
// predciate output
simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;

simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);
out_off += DIM_SIZE * sizeof(float);
simd<unsigned, WIDTH> elm16_off = elm16 + out_off;
scatter<float, WIDTH>(output, sum, elm16_off, 0, p);
out_off += DIM_SIZE;

if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1)
break;
Expand All @@ -178,8 +181,8 @@ int main(void) {
e.wait();
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
free(inputMatrix, ctxt);
free(outputMatrix, ctxt);
delete[] inputMatrix;
delete[] outputMatrix;
return e.get_cl_code();
}

Expand All @@ -190,7 +193,7 @@ int main(void) {
} else {
std::cout << "FAILED" << std::endl;
}
free(inputMatrix, ctxt);
free(outputMatrix, ctxt);
delete[] inputMatrix;
delete[] outputMatrix;
return 0;
}