Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] add ESIMD tests #29

Merged
merged 3 commits into from
Oct 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
686 changes: 686 additions & 0 deletions SYCL/ESIMD/BitonicSortK.cpp

Large diffs are not rendered by default.

601 changes: 601 additions & 0 deletions SYCL/ESIMD/BitonicSortKv2.cpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions SYCL/ESIMD/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Overview
This directory contains ESIMD tests which are run on Intel GPU device only.
Some of them can run on host device too, but in general it is not always
possible as some of ESIMD APIs (e.g. memory access via accessors) is not
implemented for the host device.
187 changes: 187 additions & 0 deletions SYCL/ESIMD/Stencil.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
//==---------------- stencil.cpp - DPC++ ESIMD on-device test ------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// TODO enable on Windows and Level Zero
// REQUIRES: linux && gpu && opencl
// RUN: %clangxx-esimd -fsycl %s -o %t.out
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out

#include "esimd_test_utils.hpp"

#include <CL/sycl.hpp>
#include <CL/sycl/INTEL/esimd.hpp>
#include <iostream>

//
// test smaller input size
// test 8x16 block size
//
#define DIM_SIZE (1 << 10)
#define SQUARE_SZ (DIM_SIZE * DIM_SIZE)

#define WIDTH 16
#define HEIGHT 16

using namespace cl::sycl;

void InitializeSquareMatrix(float *matrix, size_t const Dim,
bool const bSkipDataGeneration) {
memset(matrix, 0, Dim * Dim * sizeof(float));
if (!bSkipDataGeneration) {
for (unsigned int iRow = 0; iRow < Dim; ++iRow) {
for (unsigned int iCol = 0; iCol < Dim; ++iCol) {
matrix[iRow * Dim + iCol] = static_cast<float>(iRow + iCol);
}
}
}
}

bool CheckResults(float *out, float *in) {
unsigned int n = DIM_SIZE;
for (unsigned int i = 0; i < n; i++) {
for (unsigned int j = 0; j < n; j++) {
if ((5 <= i) && (i < n - 5) && (5 <= j) && (j < n - 5)) {
float res = +in[(i - 5) * n + (j + 0)] * -0.02f +
in[(i - 4) * n + (j + 0)] * -0.025f +
in[(i - 3) * n + (j + 0)] * -0.0333333333333f +
in[(i - 2) * n + (j + 0)] * -0.05f +
in[(i - 1) * n + (j + 0)] * -0.1f +
in[(i + 0) * n + (j - 5)] * -0.02f +
in[(i + 0) * n + (j - 4)] * -0.025f +
in[(i + 0) * n + (j - 3)] * -0.0333333333333f +
in[(i + 0) * n + (j - 2)] * -0.05f +
in[(i + 0) * n + (j - 1)] * -0.1f +
in[(i + 0) * n + (j + 1)] * 0.1f +
in[(i + 0) * n + (j + 2)] * 0.05f +
in[(i + 0) * n + (j + 3)] * 0.0333333333333f +
in[(i + 0) * n + (j + 4)] * 0.025f +
in[(i + 0) * n + (j + 5)] * 0.02f +
in[(i + 1) * n + (j + 0)] * 0.1f +
in[(i + 2) * n + (j + 0)] * 0.05f +
in[(i + 3) * n + (j + 0)] * 0.0333333333333f +
in[(i + 4) * n + (j + 0)] * 0.025f +
in[(i + 5) * n + (j + 0)] * 0.02f;

// check result
if (abs(res - out[i * n + j]) >= 0.0015f) {
std::cout << "out[" << i << "][" << j << "] = " << out[i * n + j]
<< " expect result " << res << std::endl;
return false;
}
}
}
}
return true;
}

int main(void) {
uint range_width =
(DIM_SIZE - 10) / WIDTH + (((DIM_SIZE - 10) % WIDTH == 0) ? 0 : 1);
uint range_height =
(DIM_SIZE - 10) / HEIGHT + (((DIM_SIZE - 10) % HEIGHT == 0) ? 0 : 1);
cl::sycl::range<2> GlobalRange{range_width, range_height};

std::cout << "width = " << range_width << " height = " << range_height
<< std::endl;
cl::sycl::range<2> LocalRange{1, 1};

queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());

auto dev = q.get_device();
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
auto ctxt = q.get_context();

// create and init matrices
float *inputMatrix =
static_cast<float *>(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt));
float *outputMatrix =
static_cast<float *>(malloc_shared(SQUARE_SZ * sizeof(float), dev, ctxt));
InitializeSquareMatrix(inputMatrix, DIM_SIZE, false);
InitializeSquareMatrix(outputMatrix, DIM_SIZE, true);

auto e = q.submit([&](handler &cgh) {
cgh.parallel_for<class Stencil_kernel>(
GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
uint h_pos = it.get_id(0);
uint v_pos = it.get_id(1);

simd<float, (HEIGHT + 10) * 32> vin;
// matrix HEIGHT+10 x 32
auto in = vin.format<float, HEIGHT + 10, 32>();

//
// rather than loading all data in
// the code will interleave data loading and compute
// first, we load enough data for the first 16 pixels
//
unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
#pragma unroll
for (unsigned i = 0; i < 10; i++) {
in.row(i) = block_load<float, 32>(inputMatrix + off);
off += DIM_SIZE;
}

unsigned out_off =
(((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) *
sizeof(float);
simd<unsigned, WIDTH> elm16(0, 1);

#pragma unroll
for (unsigned i = 0; i < HEIGHT; i++) {

in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
off += DIM_SIZE;

simd<float, WIDTH> sum =
in.row(i + 0).select<WIDTH, 1>(5) * -0.02f +
in.row(i + 1).select<WIDTH, 1>(5) * -0.025f +
in.row(i + 2).select<WIDTH, 1>(5) * -0.0333333333333f +
in.row(i + 3).select<WIDTH, 1>(5) * -0.05f +
in.row(i + 4).select<WIDTH, 1>(5) * -0.1f +
in.row(i + 6).select<WIDTH, 1>(5) * 0.1f +
in.row(i + 7).select<WIDTH, 1>(5) * 0.05f +
in.row(i + 8).select<WIDTH, 1>(5) * 0.0333333333333f +
in.row(i + 9).select<WIDTH, 1>(5) * 0.025f +
in.row(i + 10).select<WIDTH, 1>(5) * 0.02f +
in.row(i + 5).select<WIDTH, 1>(0) * -0.02f +
in.row(i + 5).select<WIDTH, 1>(1) * -0.025f +
in.row(i + 5).select<WIDTH, 1>(2) * -0.0333333333333f +
in.row(i + 5).select<WIDTH, 1>(3) * -0.05f +
in.row(i + 5).select<WIDTH, 1>(4) * -0.1f +
in.row(i + 5).select<WIDTH, 1>(6) * 0.1f +
in.row(i + 5).select<WIDTH, 1>(7) * 0.05f +
in.row(i + 5).select<WIDTH, 1>(8) * 0.0333333333333f +
in.row(i + 5).select<WIDTH, 1>(9) * 0.025f +
in.row(i + 5).select<WIDTH, 1>(10) * 0.02f;

// predciate output
simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;

simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);
out_off += DIM_SIZE * sizeof(float);

if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1)
break;
}
});
});
e.wait();

// check result
bool passed = CheckResults(outputMatrix, inputMatrix);
if (passed) {
std::cout << "PASSED" << std::endl;
} else {
std::cout << "FAILED" << std::endl;
}
free(inputMatrix, ctxt);
free(outputMatrix, ctxt);
return 0;
}
98 changes: 98 additions & 0 deletions SYCL/ESIMD/accessor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
//==---------------- accessor.cpp - DPC++ ESIMD on-device test ------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// TODO enable on Windows and Level Zero
// REQUIRES: linux && gpu && opencl
// RUN: %clangxx-esimd -fsycl -D_CRT_SECURE_NO_WARNINGS=1 %s -o %t.out
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out

// This test checks that accessor-based memory accesses work correctly in ESIMD.

#include "esimd_test_utils.hpp"

#include <CL/sycl.hpp>
#include <CL/sycl/INTEL/esimd.hpp>

#include <iostream>

using namespace cl::sycl;

constexpr unsigned int VL = 1024 * 128;

using Ty = float;

int main() {
Ty data0[VL] = {0};
Ty data1[VL] = {0};
constexpr Ty VAL = 5;

for (int i = 0; i < VL; i++) {
data0[i] = i;
}

try {
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());

buffer<Ty, 1> buf0(data0, range<1>(VL));
buffer<Ty, 1> buf1(data1, range<1>(VL));

q.submit([&](handler &cgh) {
std::cout << "Running on "
<< q.get_device().get_info<cl::sycl::info::device::name>()
<< "\n";

auto acc0 = buf0.get_access<access::mode::read_write>(cgh);
auto acc1 = buf1.get_access<access::mode::write>(cgh);

cgh.parallel_for<class Test>(
range<1>(1), [=](sycl::id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
unsigned int offset = 0;
for (int k = 0; k < VL / 16; k++) {
simd<Ty, 16> var = block_load<Ty, 16>(acc0, offset);
var += VAL;
block_store(acc0, offset, var);
block_store(acc1, offset, var + 1);
offset += 64;
}
});
});

q.wait();

} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return 2;
}

int err_cnt = 0;

for (int i = 0; i < VL; i++) {
Ty gold0 = i + VAL;
Ty gold1 = gold0 + 1;
Ty val0 = data0[i];
Ty val1 = data1[i];

if (val0 != gold0) {
if (++err_cnt < 10)
std::cerr << "*** ERROR at data0[" << i << "]: " << val0
<< " != " << gold0 << "(gold)\n";
}
if (val1 != gold1) {
if (++err_cnt < 10)
std::cerr << "*** ERROR at data1[" << i << "]: " << val1
<< " != " << gold1 << "(gold)\n";
}
}
if (err_cnt == 0) {
std::cout << "Passed\n";
return 0;
} else {
std::cout << "Failed: " << err_cnt << " of " << VL << " errors\n";
return 1;
}
}
Loading