Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[ESIMD] Add tests for lsc_gather() lsc_slm_gather() with merging sem… #1632

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions SYCL/ESIMD/esimd_test_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#pragma once

#include <sycl/ext/intel/experimental/esimd/tfloat32.hpp>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>
#define NOMINMAX

Expand Down Expand Up @@ -277,7 +277,7 @@ void display_timing_stats(double const *kernelTime,
std::cout << "[OverallTime][Primary]: " << overallTime << "\n";
}

// Get signed integer of given byte size.
// Get signed integer of given byte size or 'void'.
template <int N>
using int_type_t = std::conditional_t<
N == 1, int8_t,
Expand All @@ -286,6 +286,15 @@ using int_type_t = std::conditional_t<
std::conditional_t<N == 4, int32_t,
std::conditional_t<N == 8, int64_t, void>>>>;

// Get unsigned integer type of given byte size or 'void'.
template <int N>
using uint_type_t = std::conditional_t<
N == 1, uint8_t,
std::conditional_t<
N == 2, uint16_t,
std::conditional_t<N == 4, uint32_t,
std::conditional_t<N == 8, uint64_t, void>>>>;

enum class BinaryOp {
add,
sub,
Expand Down Expand Up @@ -578,4 +587,27 @@ TID(sycl::ext::intel::experimental::esimd::tfloat32)
TID(float)
TID(double)

std::string toString(sycl::ext::intel::experimental::esimd::lsc_data_size DS) {
switch (DS) {
case sycl::ext::intel::experimental::esimd::lsc_data_size::default_size:
return "lsc_data_size::default";
case sycl::ext::intel::experimental::esimd::lsc_data_size::u8:
return "lsc_data_size::u8";
case sycl::ext::intel::experimental::esimd::lsc_data_size::u16:
return "lsc_data_size::u16";
case sycl::ext::intel::experimental::esimd::lsc_data_size::u32:
return "lsc_data_size::u32";
case sycl::ext::intel::experimental::esimd::lsc_data_size::u64:
return "lsc_data_size::u64";
case sycl::ext::intel::experimental::esimd::lsc_data_size::u8u32:
return "lsc_data_size::u8u32";
case sycl::ext::intel::experimental::esimd::lsc_data_size::u16u32:
return "lsc_data_size::u16u32";
case sycl::ext::intel::experimental::esimd::lsc_data_size::u16u32h:
return "lsc_data_size::u16u32h";
}
assert(false && "Unknown lsc_data_size");
return "INVALID lsc_data_size";
}

} // namespace esimd_test
149 changes: 78 additions & 71 deletions SYCL/ESIMD/lsc/Inputs/lsc_slm_load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,18 @@

#include <iostream>

#include "../../esimd_test_utils.hpp"

using namespace sycl;
using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;

// TODO: The SPEC does not say what values are returned for lsc_slm_gather
// when the corresponding elements of the predicate/mask is zero.
// It is assumed to be undefined values there.
// Thus this test does not check those elements now. From the API point of view
// it may be better to have another argument for the values being copied to
// the result when the mask bit is 0.

template <int CaseNum, typename T, uint32_t Groups, uint32_t LocalRange,
uint16_t VL, uint16_t NChannels, bool Transpose,
template <typename T, uint32_t Groups, uint32_t LocalRange, uint16_t VL,
uint16_t NChannels, bool Transpose, bool TestMergeOperand,
lsc_data_size DS = lsc_data_size::default_size>
bool test(uint32_t PMask = ~0) {
bool test(queue Q, uint32_t PMask = ~0) {
using Tuint = esimd_test::uint_type_t<sizeof(T)>;

static_assert((NChannels == 1) || !Transpose,
"Transpose must have exec size 1");
if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
Expand All @@ -35,101 +32,111 @@ bool test(uint32_t PMask = ~0) {
"vector size is greater than 1");
}

T VMask = static_cast<T>(-1);
std::cout << "Running test: T=" << esimd_test::type_name<T>() << ", VL=" << VL
<< ", NChannels=" << NChannels
<< ", DS=" << esimd_test::toString(DS)
<< ", Transpose=" << Transpose
<< ", TestMergeOperand=" << TestMergeOperand
<< ", Groups=" << Groups << ", LocalRange=" << LocalRange
<< std::endl;

Tuint VMask = static_cast<Tuint>(-1);
if constexpr (DS == lsc_data_size::u8u32)
VMask = static_cast<T>(0xff);
VMask = 0xff;
else if constexpr (DS == lsc_data_size::u16u32)
VMask = static_cast<T>(0xffff);
VMask = 0xffff;
else if constexpr (DS == lsc_data_size::u16u32h)
VMask = static_cast<T>(0xffff0000);

queue Q(gpu_selector_v);
auto D = Q.get_device();
std::cout << "Running case #" << CaseNum << " on "
<< D.get_info<sycl::info::device::name>() << std::endl;
VMask = 0xffff0000;

nd_range<1> Range{range<1>{Groups * LocalRange}, range<1>{LocalRange}};
constexpr uint16_t OutSize = Groups * LocalRange * VL * NChannels;
uint32_t OutSize = Groups * LocalRange * VL * NChannels;
T *Out = malloc_shared<T>(OutSize, Q);
memset(Out, 0, OutSize * sizeof(T));
T MergeValue = 2;

try {
Q.submit([&](handler &cgh) {
cgh.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL {
uint32_t GID = NDId.get_global_id(0);
uint32_t LID = NDId.get_local_id(0);
uint32_t GroupID = NDId.get_group_linear_id();

// Allocate and init 128-byte multiple size SLM memory with
// consequential values. i-th group gets values:
// {0, 1, 2, ...} + GroupID * 1000000.
constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T);
constexpr uint32_t SLMSize =
(ResultSIMDByteSize * LocalRange + 127) & ~127;
slm_init(SLMSize);
if (NDId.get_local_id(0) == 0) {
simd<T, 4> Vals(GroupID * 1000000, 1);
for (int I = 0; I < SLMSize; I += 4 * sizeof(T)) {
slm_block_store<T, 4>(I, Vals);
Vals += 4;
}
Q.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL {
uint32_t GID = NDId.get_global_id(0);
uint32_t LID = NDId.get_local_id(0);
uint32_t GroupID = NDId.get_group_linear_id();

// Allocate and init 128-byte multiple size SLM memory with
// consequential values. i-th group gets values:
// {0, 1, 2, ...} + GroupID * 1000000.
constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T);
constexpr uint32_t SLMSize =
(ResultSIMDByteSize * LocalRange + 127) & ~127;
slm_init<SLMSize>();
if (NDId.get_local_id(0) == 0) {
simd<Tuint, 4> Vals(GroupID * 1000000, 1);
for (int I = 0; I < SLMSize; I += 4 * sizeof(T)) {
slm_block_store<Tuint, 4>(I, Vals);
Vals += 4;
}
barrier();

if constexpr (Transpose) {
auto Vals = lsc_slm_block_load<T, VL, DS>(LID * VL * sizeof(T));
Vals.copy_to(Out + GID * VL);
}
barrier();

if constexpr (Transpose) {
auto Vals = lsc_slm_block_load<T, VL, DS>(LID * VL * sizeof(T));
Vals.copy_to(Out + GID * VL);
} else {
simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T),
NChannels * sizeof(T));

// Create the predicate for the gather from 'PMask'.
simd_mask<VL> Pred;
for (int I = 0; I < VL; I++)
Pred.template select<1, 1>(I) = (PMask >> I) & 1;

simd<T, VL * NChannels> Vals;
if constexpr (TestMergeOperand) {
simd<T, VL *NChannels> OldVals = MergeValue;
Vals = lsc_slm_gather<T, NChannels, DS>(Offsets, Pred, OldVals);
} else {
simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T),
NChannels * sizeof(T));

// Create the predicate for the gather from 'PMask'.
simd_mask<VL> Pred;
for (int I = 0; I < VL; I++)
Pred.template select<1, 1>(I) = (PMask >> I) & 1;

simd<T, VL *NChannels> Vals =
lsc_slm_gather<T, NChannels, DS>(Offsets, Pred);

Vals.copy_to(Out + GID * VL * NChannels);
Vals = lsc_slm_gather<T, NChannels, DS>(Offsets, Pred);
}
});

Vals.copy_to(Out + GID * VL * NChannels);
}
}).wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
sycl::free(Out, Q);
return false;
}

bool Passed = true;
int NErrors = 0;

if constexpr (Transpose) {
for (uint32_t I = 0; I < OutSize; I++) {
uint32_t GroupId = I / (LocalRange * VL * NChannels);
uint32_t LID = I % (LocalRange * VL * NChannels);
T ExpectedVal = GroupId * 1000000 + LID;
if (Out[I] != ExpectedVal) {
Passed = false;
std::cout << I << ": Value = " << Out[I]
if (Out[I] != ExpectedVal && NErrors++ < 32) {
std::cout << "Error: " << I << ": Value = " << Out[I]
<< ", Expected value = " << ExpectedVal << std::endl;
}
}
} else {
for (uint32_t I = 0; I < OutSize; I += VL * NChannels) {
uint32_t GroupId = I / (LocalRange * VL * NChannels);
uint32_t LID = I % (LocalRange * VL * NChannels);
T ExpectedValBase = GroupId * 1000000 + LID;
Tuint ExpectedValBase = GroupId * 1000000 + LID;
for (int ChannelId = 0; ChannelId < NChannels; ChannelId++) {
for (int J = 0; J < VL; J++) {
uint32_t OutIndex = I + ChannelId * VL + J;

if (((PMask >> J) & 1) == 0)
bool IsMaskSet = (PMask >> J) & 1;
if (!TestMergeOperand && !IsMaskSet)
continue;
T ExpectedVal = (ExpectedValBase + ChannelId + J * NChannels) & VMask;
if (Out[OutIndex] != ExpectedVal) {
Passed = false;
std::cout << OutIndex << ": Value = " << Out[OutIndex]
<< ", Expected value = " << ExpectedVal << std::endl;
Tuint ExpectedVal =
IsMaskSet ? (ExpectedValBase + ChannelId + J * NChannels) & VMask
: sycl::bit_cast<Tuint>(MergeValue);
Tuint ComputedVal = sycl::bit_cast<Tuint>(Out[OutIndex]);
if (ComputedVal != ExpectedVal && NErrors++ < 32) {
std::cout << "Error: " << OutIndex << ": Value = " << ComputedVal
<< ", Expected value = " << ExpectedVal
<< ", Mask = " << IsMaskSet << std::endl;
}
}
}
Expand All @@ -138,7 +145,7 @@ bool test(uint32_t PMask = ~0) {

sycl::free(Out, Q);

if (!Passed)
std::cout << "Case #" << CaseNum << " FAILED" << std::endl;
return Passed;
if (NErrors)
std::cout << " FAILED" << std::endl;
return NErrors == 0;
}
64 changes: 36 additions & 28 deletions SYCL/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,15 @@ template <typename T, uint16_t N,
lsc_data_size DS = lsc_data_size::default_size,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
bool UsePrefetch = false, bool UseOldValuesOperand = true>
bool test(uint32_t Groups, uint32_t Threads) {
bool test(queue Q, uint32_t Groups, uint32_t Threads) {
static_assert(DS != lsc_data_size::u8u32 && DS != lsc_data_size::u16u32,
"unsupported DS for lsc_block_load()");
static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW");

uint32_t Size = Groups * Threads * N;
using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;

auto Q = queue{gpu_selector_v};
auto D = Q.get_device();
std::cout << "Running on " << D.get_info<sycl::info::device::name>()
<< ", T=" << esimd_test::type_name<T>() << ",N=" << N
std::cout << "Running case: T=" << esimd_test::type_name<T>() << ",N=" << N
<< ",UsePrefetch=" << UsePrefetch
<< ",UseOldValuesOperand=" << UseOldValuesOperand;

Expand Down Expand Up @@ -121,46 +118,57 @@ template <typename T> bool test_lsc_block_load() {
constexpr bool CheckMerge = true;
constexpr bool NoCheckMerge = false;

auto Q = queue{gpu_selector_v};
std::cout << "Running lsc_block_load() tests for T="
<< esimd_test::type_name<T>() << " on "
<< Q.get_device().get_info<sycl::info::device::name>() << std::endl;

bool Passed = true;
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(1, 4);
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(1, 4);
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(2, 2);
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(2, 8);
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(3, 3);
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 1, 4);
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 1, 4);
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 2, 2);
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 2, 8);
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 3, 3);
if constexpr (sizeof(T) * 2 >= sizeof(int))
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(5, 5);
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 5, 5);
if constexpr (sizeof(T) >= sizeof(int))
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(3, 5);
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 3, 5);

Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, CheckMerge>(1, 4);
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, CheckMerge>(2, 2);
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, CheckMerge>(4, 4);
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, CheckMerge>(2, 8);
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, CheckMerge>(3, 3);
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 1, 4);
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 2, 2);
Passed &= test<T, 16, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 4, 4);
Passed &= test<T, 8, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 2, 8);
Passed &= test<T, 4, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 3, 3);
if constexpr (sizeof(T) * 2 >= sizeof(int))
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, CheckMerge>(5, 5);
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 5, 5);
if constexpr (sizeof(T) >= sizeof(int))
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, CheckMerge>(3, 5);
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 3, 5);

return Passed;
}

template <typename T, lsc_data_size DS = lsc_data_size::default_size>
bool test_lsc_prefetch() {
template <typename T, lsc_data_size DS = lsc_data_size::default_size,
bool IsGatherLikePrefetch = false>
std::enable_if_t<!IsGatherLikePrefetch, bool> test_lsc_prefetch() {
constexpr cache_hint L1H = cache_hint::cached;
constexpr cache_hint L3H = cache_hint::uncached;
constexpr bool DoPrefetch = true;

auto Q = queue{gpu_selector_v};
std::cout << "Running block-load-like lsc_prefetch() tests for T="
<< esimd_test::type_name<T>() << " on "
<< Q.get_device().get_info<sycl::info::device::name>() << std::endl;

bool Passed = true;
Passed &= test<T, 64, DS, L1H, L3H, DoPrefetch>(1, 4);
Passed &= test<T, 32, DS, L1H, L3H, DoPrefetch>(1, 4);
Passed &= test<T, 16, DS, L1H, L3H, DoPrefetch>(2, 2);
Passed &= test<T, 8, DS, L1H, L3H, DoPrefetch>(2, 8);
Passed &= test<T, 4, DS, L1H, L3H, DoPrefetch>(3, 3);
Passed &= test<T, 64, DS, L1H, L3H, DoPrefetch>(Q, 1, 4);
Passed &= test<T, 32, DS, L1H, L3H, DoPrefetch>(Q, 1, 4);
Passed &= test<T, 16, DS, L1H, L3H, DoPrefetch>(Q, 2, 2);
Passed &= test<T, 8, DS, L1H, L3H, DoPrefetch>(Q, 2, 8);
Passed &= test<T, 4, DS, L1H, L3H, DoPrefetch>(Q, 3, 3);
if constexpr (sizeof(T) * 2 >= sizeof(int))
Passed &= test<T, 2, DS, L1H, L3H, DoPrefetch>(5, 5);
Passed &= test<T, 2, DS, L1H, L3H, DoPrefetch>(Q, 5, 5);
if constexpr (sizeof(T) >= sizeof(int))
Passed &= test<T, 1, DS, L1H, L3H, DoPrefetch>(3, 5);
Passed &= test<T, 1, DS, L1H, L3H, DoPrefetch>(Q, 3, 5);

return Passed;
}
Loading