Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Add ESIMD tests with SYCL 2020 spec constants #291

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 199 additions & 0 deletions SYCL/ESIMD/histogram_256_slm_spec_2020.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
// TODO enable on Windows
// REQUIRES: linux && gpu
// UNSUPPORTED: cuda
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out 16

#include "esimd_test_utils.hpp"

#include <CL/sycl.hpp>
#include <CL/sycl/INTEL/esimd.hpp>
#include <iostream>

static constexpr int NUM_BINS = 256;
static constexpr int SLM_SIZE = (NUM_BINS * 4);
static constexpr int BLOCK_WIDTH = 32;
static constexpr int NUM_BLOCKS = 32;

using namespace cl::sycl;
using namespace sycl::ext::intel::experimental::esimd;

constexpr specialization_id<unsigned int> NumBlocksSpecId(NUM_BLOCKS);

// Histogram kernel: computes the distribution of pixel intensities.
//
// Each work-item accumulates a partial histogram in SLM (shared local
// memory), then atomically folds its 16 SLM bins into the global
// 256-bin `output` array.
//
// @param input_ptr  packed pixel data; each 32-bit word holds 4 byte pixels.
// @param output     global 256-bin histogram, updated with flat atomics.
// @param gid        work-group id.
// @param lid        local (within-group) work-item id.
// @param local_size number of work-items per group (caller passes 16).
// @param num_blocks blocks of BLOCK_WIDTH pixels handled per thread; comes
//                   from the SYCL 2020 specialization constant at the call
//                   site, exercising spec constants in ESIMD kernels.
ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
                                   uint32_t gid, uint32_t lid,
                                   uint32_t local_size, uint32_t num_blocks) {
  // Declare and initialize SLM: each work-item zeroes its own 16
  // consecutive uint bins (SLM_SIZE = NUM_BINS * 4 bytes covers all 256).
  slm_init(SLM_SIZE);
  uint linear_id = gid * local_size + lid;

  // slm_offset = byte offsets of this work-item's 16 bins:
  // (16*lid + 0..15) * sizeof(int).
  simd<uint, 16> slm_offset(0, 1);
  slm_offset += 16 * lid;
  slm_offset *= sizeof(int);
  simd<uint, 16> slm_data = 0;
  slm_store<uint, 16>(slm_data, slm_offset);
  // Barrier so all bins are zeroed before any work-item starts counting.
  esimd_barrier();

  // Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
  auto start_off = (linear_id * BLOCK_WIDTH * num_blocks);
  for (int y = 0; y < num_blocks; y++) {
    auto start_addr = ((unsigned int *)input_ptr) + start_off;
    // Load 32 uints (128 bytes) and reinterpret as 128 byte-sized pixels.
    auto data = block_load<uint, 32>(start_addr);
    auto in = data.format<uchar>();

#pragma unroll
    for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
      // Accumulate local histogram for each pixel value: the pixel value
      // (0..255) scaled by sizeof(int) is the SLM byte offset of its bin.
      simd<uint, 16> dataOffset = in.select<16, 1>(j).read();
      dataOffset *= sizeof(int);
      slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(dataOffset, 1);
    }
    // NOTE(review): advancing by BLOCK_WIDTH (not BLOCK_WIDTH*sizeof(int))
    // appears intentional since start_off indexes uints -- confirm against
    // the non-spec-const version of this test.
    start_off += BLOCK_WIDTH;
  }
  // All local counting must finish before the global merge below.
  esimd_barrier();

  // Update global sum by atomically adding each local histogram.
  // flat_atomic handles at most 8 lanes per call here, so the 16 bins are
  // merged in two halves.
  simd<uint, 16> local_histogram;
  local_histogram = slm_load<uint32_t, 16>(slm_offset);
  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
      output, slm_offset.select<8, 1>(0), local_histogram.select<8, 1>(0), 1);
  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
      output, slm_offset.select<8, 1>(8), local_histogram.select<8, 1>(8), 1);
}

// Computes the reference histogram of the image on the CPU.
// Each 32-bit word of `src` packs four byte-sized pixel values, so every
// word contributes four counts.
// @param size: number of 32-bit elements in the input array.
// @param src: pointer to the packed input array.
// @param cpu_histogram: 256-entry output array; bins are incremented, not
//        cleared, so the caller must zero it beforehand.
void HistogramCPU(unsigned int size, unsigned int *src,
                  unsigned int *cpu_histogram) {
  // Unsigned loop index: matches `size` and avoids the signed/unsigned
  // comparison warning of the original `int i` loop.
  for (unsigned int i = 0; i < size; i++) {
    unsigned int x = src[i];
    // Unpack the four bytes and bump the corresponding bins.
    cpu_histogram[x & 0xFFU] += 1;
    cpu_histogram[(x >> 8) & 0xFFU] += 1;
    cpu_histogram[(x >> 16) & 0xFFU] += 1;
    cpu_histogram[(x >> 24) & 0xFFU] += 1;
  }
}

// Compares the CPU histogram with the GPU histogram bin by bin, logging
// every mismatching bin.
// Returns 1 if they are identical, 0 otherwise.
int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
  unsigned int bad = 0;
  for (int i = 0; i < NUM_BINS; i++) {
    if (cpu_histogram[i] != gpu_histogram[i]) {
      std::cout << "At " << i << ": CPU = " << cpu_histogram[i]
                << ", GPU = " << gpu_histogram[i] << std::endl;
      bad++;
    }
  }
  // The original capped logging with `if (bad >= 256) return 0;` before the
  // increment, but `bad` can reach 256 only after the loop's final mismatch
  // (NUM_BINS iterations, one increment each), so that branch was
  // unreachable dead code and has been removed.
  return bad == 0 ? 1 : 0;
}

class NumBlocksConst;
class histogram_slm;

// Drives the histogram test: builds random packed-pixel input, computes the
// reference histogram on the CPU, runs the ESIMD kernel (with the per-thread
// block count supplied through a SYCL 2020 specialization constant, optionally
// overridden by argv[1]), and compares the results.
int main(int argc, char **argv) {
  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
  auto dev = q.get_device();
  auto ctxt = q.get_context();

  unsigned int width = 1024 * sizeof(unsigned int);
  unsigned int height = 1024;

  // Input lives in USM shared memory so both host init and the GPU kernel
  // can access it. `input_size` is in BYTES at this point.
  unsigned int input_size = width * height;
  unsigned int *input_ptr =
      (unsigned int *)malloc_shared(input_size, dev, ctxt);
  printf("Processing %dx%d inputs\n", (int)(width / sizeof(unsigned int)),
         height);

  // Fixed seed keeps the test input deterministic across runs.
  srand(2009);
  input_size = input_size / sizeof(int); // now in 32-bit elements
  for (unsigned int i = 0; i < input_size; ++i) {
    // Pack four independent byte-sized pixel values per 32-bit word.
    input_ptr[i] = rand() % 256;
    input_ptr[i] |= (rand() % 256) << 8;
    input_ptr[i] |= (rand() % 256) << 16;
    input_ptr[i] |= (rand() % 256) << 24;
  }

  // Host-side copy of the GPU result. `buffer_size` is in bytes; the
  // original allocated `new unsigned int[buffer_size]` -- buffer_size
  // ELEMENTS, i.e. 4x too much memory -- fixed to NUM_BINS elements.
  // (operator new[] throws std::bad_alloc on failure, so no null check.)
  int buffer_size = sizeof(unsigned int) * NUM_BINS;
  unsigned int *hist = new unsigned int[NUM_BINS];
  memset(hist, 0, buffer_size);

  // Uses the CPU to calculate the reference histogram.
  unsigned int cpu_histogram[NUM_BINS];
  memset(cpu_histogram, 0, sizeof(cpu_histogram));

  HistogramCPU(input_size, input_ptr, cpu_histogram);

  std::cout << "finish cpu_histogram\n";

  // GPU output surface, zeroed before the kernel accumulates into it.
  unsigned int *output_surface =
      (uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt);
  memset(output_surface, 0, 4 * NUM_BINS);

  unsigned int num_blocks{NUM_BLOCKS};
  if (argc == 2) {
    num_blocks = atoi(argv[1]);
    // Guard against non-numeric/zero argv[1]: the original divided by
    // num_blocks below, so 0 caused a division by zero.
    if (num_blocks == 0) {
      std::cerr << "Invalid num_blocks argument: " << argv[1] << "\n";
      delete[] hist;
      return 1;
    }
    std::cout << "new num_blocks = " << num_blocks << "\n";
  }

  // Each thread consumes num_blocks * BLOCK_WIDTH 32-bit words.
  unsigned int num_threads =
      width * height / (num_blocks * BLOCK_WIDTH * sizeof(int));

  auto GlobalRange = cl::sycl::range<1>(num_threads);
  auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
  cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);

  try {
    auto e = q.submit([&](cl::sycl::handler &cgh) {
      // The runtime value of num_blocks reaches the kernel through the
      // SYCL 2020 specialization constant, which is what this test checks.
      cgh.set_specialization_constant<NumBlocksSpecId>(num_blocks);
      cgh.parallel_for<histogram_slm>(
          Range,
          [=](cl::sycl::nd_item<1> ndi, kernel_handler kh) SYCL_ESIMD_KERNEL {
            histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
                             ndi.get_local_id(0), 16,
                             kh.get_specialization_constant<NumBlocksSpecId>());
          });
    });
    e.wait();
  } catch (cl::sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    free(output_surface, ctxt);
    free(input_ptr, ctxt);
    delete[] hist;
    return e.get_cl_code();
  }

  std::cout << "finish GPU histogram\n";

  memcpy(hist, output_surface, 4 * NUM_BINS);

  free(output_surface, ctxt);

  free(input_ptr, ctxt);

  // Compares the CPU histogram output data with the
  // GPU histogram output data.
  // If there is no difference, the result is correct.
  // Otherwise there is something wrong.
  int res = CheckHistogram(cpu_histogram, hist);
  if (res)
    std::cout << "PASSED\n";
  else
    std::cout << "FAILED\n";

  // The original leaked `hist`; release it before exiting.
  delete[] hist;

  return res ? 0 : -1;
}
80 changes: 80 additions & 0 deletions SYCL/ESIMD/spec_const/Inputs/spec-const-2020-common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// The test checks that ESIMD kernels support SYCL 2020 specialization constants
// for all basic types; in particular, that a specialization constant can be
// redefined and that the correct new value is used after redefinition.

#include "esimd_test_utils.hpp"

#include <CL/sycl.hpp>
#include <CL/sycl/INTEL/esimd.hpp>

#include <iostream>
#include <vector>

using namespace cl::sycl;

// Writes the spec-constant value `val` into accessor `acc` at offset `i`,
// using the store flavor selected at compile time by the STORE macro:
//   STORE == 0 -- bool case: normalize to 0/1 and use scalar_store.
//   STORE == 1 -- block store of a 2-element simd filled with `val`.
//   STORE == 2 -- plain scalar_store of `val`.
// Any other STORE value is rejected at compile time.
// NOTE(review): whether `i` is an element index or a byte offset depends on
// the ESIMD store API used -- confirm against scalar_store/block_store docs.
template <typename AccessorTy>
ESIMD_INLINE void do_store(AccessorTy acc, int i, spec_const_t val) {
  using namespace sycl::ext::intel::experimental::esimd;
  // scatter function, that is used in scalar_store, can only process types
  // whose size is no more than 4 bytes.
#if (STORE == 0)
  // bool
  scalar_store(acc, i, val ? 1 : 0);
#elif (STORE == 1)
  // block
  block_store(acc, i, simd<spec_const_t, 2>{val});
#else
  static_assert(STORE == 2, "Unspecified store");
  // scalar
  scalar_store(acc, i, val);
#endif
}

class TestKernel;

constexpr specialization_id<spec_const_t> ConstID(DEF_VAL);

// Runs the kernel twice: iteration 0 uses the spec constant's default value
// (DEF_VAL), iteration 1 redefines it to REDEF_VAL before submission. Each
// run stores the value the kernel observed into output[i], which is then
// compared against the expected ("etalon") value.
int main(int argc, char **argv) {
  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());

  auto dev = q.get_device();
  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";

  // Expected per-iteration values: default first, then the redefined one.
  std::vector<container_t> etalon = {DEF_VAL, REDEF_VAL};
  const size_t n_times = etalon.size();
  std::vector<container_t> output(n_times);

  bool passed = true;
  for (int i = 0; i < n_times; i++) {
    try {
      // Buffer scope ends each iteration, so output[] is updated on the
      // host before the comparison below.
      sycl::buffer<container_t, 1> buf(output.data(), output.size());

      q.submit([&](sycl::handler &cgh) {
        auto acc = buf.get_access<sycl::access::mode::write>(cgh);
        // Odd iterations override the spec constant before enqueue; even
        // iterations leave the default from ConstID in effect.
        if (i % 2 != 0)
          cgh.set_specialization_constant<ConstID>(REDEF_VAL);
        cgh.single_task<TestKernel>([=](kernel_handler kh) SYCL_ESIMD_KERNEL {
          do_store(acc, i, kh.get_specialization_constant<ConstID>());
        });
      });
    } catch (cl::sycl::exception const &e) {
      std::cout << "SYCL exception caught: " << e.what() << '\n';
      return e.get_cl_code();
    }

    if (output[i] != etalon[i]) {
      passed = false;
      std::cout << "comparison error -- case #" << i << " -- ";
      std::cout << "output: " << output[i] << ", ";
      std::cout << "etalon: " << etalon[i] << std::endl;
    }
  }

  if (passed) {
    std::cout << "passed" << std::endl;
    return 0;
  }

  std::cout << "FAILED" << std::endl;
  return 1;
}
6 changes: 6 additions & 0 deletions SYCL/ESIMD/spec_const/spec_const_ushort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
// type size.
// RUN: %clangxx -fsycl -I%S/.. %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %clangxx -fsycl -I%S/.. -DSYCL2020 %s -o %t.2020.out
// RUN: %GPU_RUN_PLACEHOLDER %t.2020.out
// UNSUPPORTED: cuda

#include <cstdint>
Expand All @@ -27,4 +29,8 @@
using spec_const_t = uint16_t;
using container_t = uint16_t;

#ifndef SYCL2020
#include "Inputs/spec_const_common.hpp"
#else
#include "Inputs/spec-const-2020-common.hpp"
#endif