This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Add tests for SYCL2020 queue::parallel_for() shortcuts #405

Merged
4 commits merged on May 3, 2022
144 changes: 144 additions & 0 deletions SYCL/Reduction/reduction_nd_N_queue_shortcut.cpp
@@ -0,0 +1,144 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out

// `Group algorithms are not supported on host device.` on NVidia.
// XFAIL: hip_nvidia

Review thread on the XFAIL line above:

Reviewer: I wonder if this is still applicable. Should we rerun the tests?

Author: I think normal CI would report/complain about this test if it passed on NVidia, right?

Author (@v-klochkov, May 3, 2022): I mean, this test copies the XFAIL from the existing test being removed/replaced in this PR: SYCL/Reduction/reduction_queue_parallel_for.cpp. If it passed, that would be seen during CI runs.

Reviewer: If it is the case for other similar tests, then I suspect you are right. If it starts unexpectedly passing, we'll know to remove it. 😄


// RUNx: %HOST_RUN_PLACEHOLDER %t.out
// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and
// barrier()

// This test only checks that the queue::parallel_for() method accepting
// reductions can be properly translated into queue::submit() + parallel_for().

#include "reduction_utils.hpp"

using namespace sycl;

template <typename T, int I> class KName;

enum TestCase { NoDependencies, Dependency, DependenciesVector };

template <typename T> T *allocUSM(queue &Q, size_t Size) {
if (!Q.get_device().has(getUSMAspect(usm::alloc::shared)))
return nullptr;

return malloc_shared<T>(Size, Q);
}

template <typename RangeT>
void printNVarsTestLabel(bool IsSYCL2020, const RangeT &Range,
bool ToCERR = false) {
std::ostream &OS = ToCERR ? std::cerr : std::cout;
std::string Mode = IsSYCL2020 ? "SYCL2020" : "ONEAPI ";
OS << (ToCERR ? "Error" : "Start") << ": Mode=" << Mode
<< ", Range=" << Range;
if (!ToCERR)
OS << std::endl;
}

template <typename T1, typename T2, TestCase TC, int Dims, typename BOpT1,
typename BOpT2>
int test(queue &Q, BOpT1 BOp1, BOpT2 BOp2, const nd_range<Dims> &Range) {
printNVarsTestLabel(true /*SYCL2020*/, Range);

size_t NElems = Range.get_global_range().size();
T1 *Sum1 = allocUSM<T1>(Q, 1);
T2 *Sum2 = allocUSM<T2>(Q, 1);
T1 *Arr1 = allocUSM<T1>(Q, NElems);
T2 *Arr2 = allocUSM<T2>(Q, NElems);
if (!Sum1 || !Sum2 || !Arr1 || !Arr2) {
sycl::free(Sum1, Q);
sycl::free(Sum2, Q);
sycl::free(Arr1, Q);
sycl::free(Arr2, Q);
std::cout << " SKIPPED due to unrelated problems with USM" << std::endl;
return 0;
}

*Sum2 = 0;
auto R1 = sycl::reduction(
Sum1, BOp1, property_list(property::reduction::initialize_to_identity{}));
auto R2 = sycl::reduction(Sum2, static_cast<T2>(0), BOp2);

if constexpr (TC == TestCase::NoDependencies) {
std::fill(Arr1, Arr1 + NElems, 1);
std::fill(Arr2, Arr2 + NElems, 2);
Q.parallel_for(Range, R1, R2,
[=](nd_item<Dims> It, auto &Sum1, auto &Sum2) {
size_t LinId = It.get_global_linear_id();
Sum1.combine(static_cast<T1>(LinId) + Arr1[LinId]);
Sum2.combine(static_cast<T2>(LinId) + Arr2[LinId]);
})
.wait();
} else if constexpr (TC == TestCase::Dependency) {
auto E = Q.single_task([=]() {
std::fill(Arr1, Arr1 + NElems, 1);
std::fill(Arr2, Arr2 + NElems, 2);
});
Q.parallel_for(Range, E, R1, R2,
[=](nd_item<Dims> It, auto &Sum1, auto &Sum2) {
size_t LinId = It.get_global_linear_id();
Sum1.combine(static_cast<T1>(LinId) + Arr1[LinId]);
Sum2.combine(static_cast<T2>(LinId) + Arr2[LinId]);
})
.wait();
} else {
auto E1 = Q.single_task([=]() { std::fill(Arr1, Arr1 + NElems, 1); });
auto E2 = Q.single_task([=]() { std::fill(Arr2, Arr2 + NElems, 2); });
std::vector<event> EVec{E1, E2};
Q.parallel_for(Range, EVec, R1, R2,
[=](nd_item<Dims> It, auto &Sum1, auto &Sum2) {
size_t LinId = It.get_global_linear_id();
Sum1.combine(static_cast<T1>(LinId) + Arr1[LinId]);
Sum2.combine(static_cast<T2>(LinId) + Arr2[LinId]);
})
.wait();
}

T1 ExpectedSum1 = NElems + (NElems - 1) * NElems / 2;
T2 ExpectedSum2 = 2 * NElems + (NElems - 1) * NElems / 2;
std::string AddInfo = "TestCase=";
int Error = checkResults(Q, true /*SYCL2020*/, BOp1, Range, *Sum1,
ExpectedSum1, AddInfo + std::to_string(1));
Error += checkResults(Q, true /*SYCL2020*/, BOp2, Range, *Sum2, ExpectedSum2,
AddInfo + std::to_string(2));

sycl::free(Sum1, Q);
sycl::free(Sum2, Q);
sycl::free(Arr1, Q);
sycl::free(Arr2, Q);
return Error;
}

template <typename T1, typename T2, int Dims, typename BinaryOperation1,
typename BinaryOperation2>
int tests(queue &Q, BinaryOperation1 BOp1, BinaryOperation2 BOp2,
const nd_range<Dims> &Range) {
int NumErrors = 0;
NumErrors += test<T1, T2, TestCase::NoDependencies>(Q, BOp1, BOp2, Range);
NumErrors += test<T1, T2, TestCase::Dependency>(Q, BOp1, BOp2, Range);
NumErrors += test<T1, T2, TestCase::DependenciesVector>(Q, BOp1, BOp2, Range);
return NumErrors;
}

int main() {
queue Q;
printDeviceInfo(Q);

int NumErrors = 0;
auto LambdaSum = [](auto X, auto Y) { return (X + Y); };

NumErrors +=
tests<int, short>(Q, std::plus<>{}, LambdaSum, nd_range<1>{32, 16});
NumErrors += tests<int, short>(Q, std::plus<>{}, LambdaSum,
nd_range<2>{range<2>{4, 4}, range<2>{2, 2}});
NumErrors +=
tests<int, short>(Q, std::plus<>{}, LambdaSum,
nd_range<3>{range<3>{4, 4, 3}, range<3>{1, 2, 3}});

printFinalStatus(NumErrors);
return NumErrors;
}
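
The comment at the top of this test describes the shortcut-to-submit translation it relies on. Below is a minimal, self-contained sketch of that equivalence; it is not part of the PR, and the header name <sycl/sycl.hpp>, the default-constructed queue, and the toy kernel and sizes are assumptions made for illustration only.

// Sketch only (not from the PR): the queue::parallel_for() shortcut with a
// reduction is expected to produce the same result as the explicit
// queue::submit() + handler::parallel_for() form shown here.
#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  if (!Q.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0; // skip if shared USM is unavailable, as the tests above do

  int *Sum = sycl::malloc_shared<int>(1, Q);
  *Sum = 0;
  auto Redu = sycl::reduction(Sum, sycl::plus<int>{});

  // Shortcut form exercised by the new tests:
  //   Q.parallel_for(sycl::nd_range<1>{32, 16}, Redu,
  //                  [=](sycl::nd_item<1>, auto &S) { S.combine(1); }).wait();
  //
  // Explicit form it is expected to translate into:
  Q.submit([&](sycl::handler &CGH) {
     CGH.parallel_for(sycl::nd_range<1>{32, 16}, Redu,
                      [=](sycl::nd_item<1>, auto &S) { S.combine(1); });
   }).wait();

  int Result = *Sum; // expected: 32, one combine(1) per work-item
  sycl::free(Sum, Q);
  return Result == 32 ? 0 : 1;
}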
97 changes: 97 additions & 0 deletions SYCL/Reduction/reduction_nd_queue_shortcut.cpp
@@ -0,0 +1,97 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out

// `Group algorithms are not supported on host device.` on NVidia.
// XFAIL: hip_nvidia

// RUNx: %HOST_RUN_PLACEHOLDER %t.out
// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and
// barrier()

// This test only checks that the queue::parallel_for() method accepting
// reductions can be properly translated into queue::submit() + parallel_for().

#include "reduction_utils.hpp"

using namespace sycl;

template <typename T, int I> class KName;

enum TestCase { NoDependencies, Dependency, DependenciesVector };

template <typename T> T *allocUSM(queue &Q, size_t Size) {
if (!Q.get_device().has(getUSMAspect(usm::alloc::shared)))
return nullptr;

return malloc_shared<T>(Size, Q);
}

template <typename T, TestCase TC, int Dims, typename BinaryOperation>
int test(queue &Q, BinaryOperation BOp, const nd_range<Dims> &Range) {
printTestLabel<T, BinaryOperation>(true /*SYCL2020*/, Range);

size_t NElems = Range.get_global_range().size();
T *Sum = allocUSM<T>(Q, 1);
T *Arr = allocUSM<T>(Q, NElems);
if (!Sum || !Arr) {
std::cout << " SKIPPED due to unrelated problems with USM" << std::endl;
sycl::free(Sum, Q);
sycl::free(Arr, Q);
return 0;
}

auto Redu = sycl::reduction(
Sum, BOp, property_list(property::reduction::initialize_to_identity{}));
if constexpr (TC == TestCase::NoDependencies) {
std::fill(Arr, Arr + NElems, 1);
Q.parallel_for(Range, Redu, [=](nd_item<Dims> It, auto &Sum) {
size_t LinId = It.get_global_linear_id();
Sum.combine(static_cast<T>(LinId) + Arr[LinId]);
}).wait();
} else if constexpr (TC == TestCase::Dependency) {
auto E = Q.single_task([=]() { std::fill(Arr, Arr + NElems, 1); });
Q.parallel_for(Range, E, Redu, [=](nd_item<Dims> It, auto &Sum) {
size_t LinId = It.get_global_linear_id();
Sum.combine(static_cast<T>(LinId) + Arr[LinId]);
}).wait();
} else {
auto E = Q.single_task([=]() { std::fill(Arr, Arr + NElems, 1); });
std::vector<event> EVec{E};
Q.parallel_for(Range, EVec, Redu, [=](nd_item<Dims> It, auto &Sum) {
size_t LinId = It.get_global_linear_id();
Sum.combine(static_cast<T>(LinId) + Arr[LinId]);
}).wait();
}

T ExpectedSum = NElems + (NElems - 1) * NElems / 2;
int Error = checkResults(Q, true /*SYCL2020*/, BOp, Range, *Sum, ExpectedSum);
free(Sum, Q);
free(Arr, Q);
return Error;
}

template <typename T, int Dims, typename BinaryOperation>
int tests(queue &Q, BinaryOperation BOp, const nd_range<Dims> &Range) {
int NumErrors = 0;
NumErrors += test<T, TestCase::NoDependencies>(Q, BOp, Range);
NumErrors += test<T, TestCase::Dependency>(Q, BOp, Range);
NumErrors += test<T, TestCase::DependenciesVector>(Q, BOp, Range);
return NumErrors;
}

int main() {
queue Q;
printDeviceInfo(Q);

int NumErrors = 0;
NumErrors += tests<int>(Q, std::plus<>{}, nd_range<1>{32, 16});
NumErrors +=
tests<int>(Q, std::plus<>{}, nd_range<2>{range<2>{4, 4}, range<2>{2, 2}});
NumErrors += tests<int>(Q, std::plus<>{},
nd_range<3>{range<3>{4, 4, 3}, range<3>{1, 2, 3}});

printFinalStatus(NumErrors);
return NumErrors;
}
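
The Dependency and DependenciesVector cases in the tests above pass an event (or a vector of events) straight to the queue::parallel_for() shortcut. The sketch below, which is not part of the PR, shows the explicit handler::depends_on() form those shortcuts are expected to translate into; the header name, default queue, and toy sizes are assumptions for illustration.

// Sketch only (not from the PR): Q.parallel_for(Range, Event, Redu, Kernel)
// and Q.parallel_for(Range, EventVector, Redu, Kernel) are expected to behave
// like an explicit submission that registers the dependency first.
#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  if (!Q.get_device().has(sycl::aspect::usm_shared_allocations))
    return 0; // skip if shared USM is unavailable, as the tests above do

  constexpr size_t N = 16;
  int *Arr = sycl::malloc_shared<int>(N, Q);
  int *Sum = sycl::malloc_shared<int>(1, Q);
  *Sum = 0;

  // Producer kernel the reduction must wait for.
  sycl::event Init = Q.single_task([=]() {
    for (size_t I = 0; I < N; ++I)
      Arr[I] = 1;
  });

  auto Redu = sycl::reduction(Sum, sycl::plus<int>{});

  // Explicit form that Q.parallel_for(Range, Init, Redu, ...) is expected to
  // translate into:
  Q.submit([&](sycl::handler &CGH) {
     CGH.depends_on(Init);
     CGH.parallel_for(sycl::nd_range<1>{N, N}, Redu,
                      [=](sycl::nd_item<1> It, auto &S) {
                        S.combine(Arr[It.get_global_linear_id()]);
                      });
   }).wait();

  int Result = *Sum; // expected: N
  sycl::free(Arr, Q);
  sycl::free(Sum, Q);
  return Result == static_cast<int>(N) ? 0 : 1;
}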
60 changes: 0 additions & 60 deletions SYCL/Reduction/reduction_queue_parallel_for.cpp

This file was deleted.
