Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Test extension group load/store span-based API #1539

Open
wants to merge 5 commits into
base: intel
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
370 changes: 370 additions & 0 deletions SYCL/GroupLoadStore/group_load_store_api.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,370 @@
// RUN: %clangxx -fsycl %s -o %t.out
// Test for group load/store functionality with span-based API
// TODO: Add stripped case

#include <algorithm>
#include <cassert>
#include <limits>
#include <numeric>
#include <sycl/sycl.hpp>

namespace sycl_exp = sycl::ext::oneapi::experimental;

constexpr std::size_t block_size = 32;
constexpr std::size_t items_per_thread = 3;
constexpr std::size_t block_count = 2;
constexpr std::size_t size = block_count * block_size * items_per_thread;

template <typename InputContainer, typename OutputContainer>
void test_single_value(sycl::queue q, InputContainer &input,
OutputContainer &output) {
typedef typename InputContainer::value_type InputT;
typedef typename OutputContainer::value_type OutputT;
{
sycl::buffer<InputT> in_buf(input.data(), input.size());
sycl::buffer<OutputT> out_buf(output.data(), output.size());

q.submit([&](sycl::handler &cgh) {
sycl::accessor in(in_buf, cgh, sycl::read_only);
sycl::accessor out(out_buf, cgh, sycl::write_only);

cgh.parallel_for(
sycl::nd_range<1>(size, block_size), [=](sycl::nd_item<1> item) {
auto group = item.get_group();

InputT data;

auto offset = group.get_group_id(0) * group.get_local_range(0);

sycl_exp::group_load(group, in.get_pointer() + offset, data);

data += item.get_global_linear_id() * 100000;

sycl_exp::group_store(group, data, out.get_pointer() + offset);
});
});
}
}

template <typename InputContainer, typename OutputContainer>
void test_vec(sycl::queue q, InputContainer &input, OutputContainer &output) {
typedef typename InputContainer::value_type InputT;
typedef typename OutputContainer::value_type OutputT;
{
sycl::buffer<InputT> in_buf(input.data(), input.size());
sycl::buffer<OutputT> out_buf(output.data(), output.size());

q.submit([&](sycl::handler &cgh) {
sycl::accessor in(in_buf, cgh, sycl::read_only);
sycl::accessor out(out_buf, cgh, sycl::write_only);

cgh.parallel_for(
sycl::nd_range<1>(block_count * block_size, block_size),
[=](sycl::nd_item<1> item) {
auto group = item.get_group();

auto offset = group.get_group_id(0) * group.get_local_range(0) *
items_per_thread;

sycl::vec<InputT, items_per_thread> data;

sycl_exp::group_load(group, in.get_pointer() + offset, data);

for (int i = 0; i < items_per_thread; ++i) {
data[i] += item.get_global_linear_id() * 100000 + i * 1000;
}

sycl_exp::group_store(group, data, out.get_pointer() + offset);
});
});
}
}

template <typename InputContainer, typename OutputContainer>
void test_no_mem(sycl::queue q, InputContainer &input,
OutputContainer &output) {
typedef typename InputContainer::value_type InputT;
typedef typename OutputContainer::value_type OutputT;
{
sycl::buffer<InputT> in_buf(input.data(), input.size());
sycl::buffer<OutputT> out_buf(output.data(), output.size());

q.submit([&](sycl::handler &cgh) {
sycl::accessor in(in_buf, cgh, sycl::read_only);
sycl::accessor out(out_buf, cgh, sycl::write_only);

cgh.parallel_for(sycl::nd_range<1>(block_count * block_size, block_size),
[=](sycl::nd_item<1> item) {
auto group = item.get_group();

auto offset = group.get_group_id(0) *
group.get_local_range(0) *
items_per_thread;

InputT data[items_per_thread];

sycl_exp::group_load(group, in.get_pointer() + offset,
sycl::span{data});

for (int i = 0; i < items_per_thread; ++i) {
data[i] +=
item.get_global_linear_id() * 100000 + i * 1000;
}

sycl_exp::group_store(group, sycl::span{data},
out.get_pointer() + offset);
});
});
}
}

template <typename InputContainer, typename OutputContainer>
void test_local_acc(sycl::queue q, InputContainer &input,
OutputContainer &output) {
typedef typename InputContainer::value_type InputT;
typedef typename OutputContainer::value_type OutputT;
{
sycl::buffer<InputT> in_buf(input.data(), input.size());
sycl::buffer<OutputT> out_buf(output.data(), output.size());

q.submit([&](sycl::handler &cgh) {
sycl::accessor in(in_buf, cgh, sycl::read_only);
sycl::accessor out(out_buf, cgh, sycl::write_only);
constexpr auto temp_memory_size =
sycl_exp::memory_required<InputT, items_per_thread>(
sycl::memory_scope::work_group, block_size);
sycl::local_accessor<std::byte> buf(temp_memory_size, cgh);
cgh.parallel_for(sycl::nd_range<1>(block_count * block_size, block_size),
[=](sycl::nd_item<1> item) {
auto group = item.get_group();

auto offset = group.get_group_id(0) *
group.get_local_range(0) *
items_per_thread;

InputT data[items_per_thread];
std::byte *buf_ptr = buf.get_pointer().get();
sycl_exp::group_with_scratchpad gh{
group, sycl::span{buf_ptr, temp_memory_size}};

sycl_exp::group_load(gh, in.get_pointer() + offset,
sycl::span{data});

for (int i = 0; i < items_per_thread; ++i) {
data[i] +=
item.get_global_linear_id() * 100000 + i * 1000;
}

sycl_exp::group_store(gh, sycl::span{data},
out.get_pointer() + offset);
});
});
}
}

template <typename InputContainer, typename OutputContainer>
void test_group_local_memory(sycl::queue q, InputContainer &input,
OutputContainer &output) {
typedef typename InputContainer::value_type InputT;
typedef typename OutputContainer::value_type OutputT;
{
sycl::buffer<InputT> in_buf(input.data(), input.size());
sycl::buffer<OutputT> out_buf(output.data(), output.size());

q.submit([&](sycl::handler &cgh) {
sycl::accessor in(in_buf, cgh, sycl::read_only);
sycl::accessor out(out_buf, cgh, sycl::write_only);
constexpr auto temp_memory_size =
sycl_exp::memory_required<InputT, items_per_thread>(
sycl::memory_scope::work_group, block_size);
cgh.parallel_for(sycl::nd_range<1>(block_count * block_size, block_size),
[=](sycl::nd_item<1> item) {
auto group = item.get_group();

auto offset = group.get_group_id(0) *
group.get_local_range(0) *
items_per_thread;

InputT data[items_per_thread];
auto scratch = sycl::ext::oneapi::group_local_memory<
std::byte[temp_memory_size]>(group);
std::byte *buf_ptr = (std::byte *)(scratch.get());

sycl_exp::group_with_scratchpad gh{
group, sycl::span{buf_ptr, temp_memory_size}};

sycl_exp::group_load(gh, in.get_pointer() + offset,
sycl::span{data});

for (int i = 0; i < items_per_thread; ++i) {
data[i] +=
item.get_global_linear_id() * 100000 + i * 1000;
}

sycl_exp::group_store(gh, sycl::span{data},
out.get_pointer() + offset);
});
});
}
}

template <typename InputContainer, typename OutputContainer>
void test_marray(sycl::queue q, InputContainer &input,
OutputContainer &output) {
typedef typename InputContainer::value_type InputT;
typedef typename OutputContainer::value_type OutputT;
{
sycl::buffer<InputT> in_buf(input.data(), input.size());
sycl::buffer<OutputT> out_buf(output.data(), output.size());

q.submit([&](sycl::handler &cgh) {
sycl::accessor in(in_buf, cgh, sycl::read_only);
sycl::accessor out(out_buf, cgh, sycl::write_only);
cgh.parallel_for(
sycl::nd_range<1>(block_count * block_size, block_size),
[=](sycl::nd_item<1> item) {
auto group = item.get_group();

auto offset = group.get_group_id(0) * group.get_local_range(0) *
items_per_thread;

sycl::marray<InputT, items_per_thread> data;

sycl_exp::group_load(
group, in.get_pointer() + offset,
sycl::span<InputT, items_per_thread>{data.begin(), data.end()});

for (int i = 0; i < items_per_thread; ++i) {
data[i] += item.get_global_linear_id() * 100000 + i * 1000;
}

sycl_exp::group_store(
group,
sycl::span<InputT, items_per_thread>{data.begin(), data.end()},
out.get_pointer() + offset);
});
});
}
}

template <typename InputContainer, typename OutputContainer>
void test_vec_span_api(sycl::queue q, InputContainer &input,
OutputContainer &output) {
typedef typename InputContainer::value_type InputT;
typedef typename OutputContainer::value_type OutputT;
{
sycl::buffer<InputT> in_buf(input.data(), input.size());
sycl::buffer<OutputT> out_buf(output.data(), output.size());

q.submit([&](sycl::handler &cgh) {
sycl::accessor in(in_buf, cgh, sycl::read_only);
sycl::accessor out(out_buf, cgh, sycl::write_only);
cgh.parallel_for(
sycl::nd_range<1>(block_count * block_size, block_size),
[=](sycl::nd_item<1> item) {
auto group = item.get_group();

auto offset = group.get_group_id(0) * group.get_local_range(0) *
items_per_thread;

sycl::vec<InputT, items_per_thread> data;

sycl_exp::group_load(group, in.get_pointer() + offset,
sycl::span<InputT, items_per_thread>{
&data[0], &data[0] + items_per_thread});

for (int i = 0; i < items_per_thread; ++i) {
data[i] += item.get_global_linear_id() * 100000 + i * 1000;
}

sycl_exp::group_store(group,
sycl::span<InputT, items_per_thread>{
&data[0], &data[0] + items_per_thread},
out.get_pointer() + offset);
});
});
}
}

template <typename InputContainer, typename OutputContainer>
int check_correctness_single_value(InputContainer &input,
OutputContainer &output,
std::string test_name) {
for (int i = 0; i < input.size(); i++) {
if ((input[i] + i * 100000) != output[i]) {
std::cout << i << " " << input[i] << " " << output[i] << std::endl;
std::cout << test_name << " test failed" << std::endl;
return 1;
}
}
std::cout << test_name << " test passed" << std::endl;
return 0;
}

template <typename InputContainer, typename OutputContainer>
int check_correctness(InputContainer &input, OutputContainer &output,
std::string test_name) {
for (int i = 0; i < input.size() / items_per_thread; i++) {
for (int j = 0; j < items_per_thread; j++) {
int idx = i * items_per_thread + j;
if ((input[idx] + i * 100000 + j * 1000) != output[idx]) {
std::cout << i << " " << input[idx] << " " << output[idx] << std::endl;
std::cout << test_name << " test failed" << std::endl;
return 1;
}
}
}
std::cout << test_name << " test passed" << std::endl;
return 0;
}

int main() {
sycl::queue q;

std::vector<int> input(size);
std::vector<int> output(size);

std::iota(input.begin(), input.end(), 0);
std::fill(output.begin(), output.end(), 0);

test_single_value(q, input, output);
assert(check_correctness_single_value(input, output, "single value") == 0);

std::iota(input.begin(), input.end(), 0);
std::fill(output.begin(), output.end(), 0);

test_vec(q, input, output);
assert(check_correctness(input, output, "sycl::vec") == 0);

std::iota(input.begin(), input.end(), 0);
std::fill(output.begin(), output.end(), 0);

test_no_mem(q, input, output);
assert(check_correctness(input, output, "No local memory") == 0);

std::iota(input.begin(), input.end(), 0);
std::fill(output.begin(), output.end(), 0);

test_local_acc(q, input, output);
assert(check_correctness(input, output, "Local accessor") == 0);

std::iota(input.begin(), input.end(), 0);
std::fill(output.begin(), output.end(), 0);

test_group_local_memory(q, input, output);
assert(check_correctness(input, output, "Group local memory") == 0);

std::iota(input.begin(), input.end(), 0);
std::fill(output.begin(), output.end(), 0);

test_marray(q, input, output);
assert(check_correctness(input, output, "sycl::marray") == 0);

std::iota(input.begin(), input.end(), 0);
std::fill(output.begin(), output.end(), 0);

test_vec_span_api(q, input, output);
assert(check_correctness(input, output, "sycl::vec span api") == 0);

std::cout << "All tests passed" << std::endl;
}