Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit de7dc0a

Browse files
fineg74v-klochkov
andauthored
[SYCL][ESIMD]Add tests to validate lsc_block_load/store using 64 bit elements by default (#1590)
Co-authored-by: Vyacheslav Klochkov <[email protected]>
1 parent df9cb59 commit de7dc0a

10 files changed

+166
-38
lines changed

SYCL/ESIMD/lsc/Inputs/lsc_surf_load.hpp

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ using namespace sycl;
1717
using namespace sycl::ext::intel::esimd;
1818
using namespace sycl::ext::intel::experimental::esimd;
1919

20-
template <int case_num, typename T, uint32_t Groups, uint32_t Threads,
21-
uint16_t VL, uint16_t VS, bool transpose,
22-
lsc_data_size DS = lsc_data_size::default_size,
23-
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
24-
bool use_prefetch = false>
20+
template <
21+
int case_num, typename T, uint32_t Groups, uint32_t Threads, uint16_t VL,
22+
uint16_t VS, bool transpose, lsc_data_size DS = lsc_data_size::default_size,
23+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
24+
bool use_prefetch = false, typename Flags = __ESIMD_NS::overaligned_tag<4>>
2525
bool test(uint32_t pmask = 0xffffffff) {
2626
static_assert((VL == 1) || !transpose, "Transpose must have exec size 1");
2727
if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
@@ -61,8 +61,13 @@ bool test(uint32_t pmask = 0xffffffff) {
6161
sycl::range<1> LocalRange{Threads};
6262
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
6363

64-
std::vector<T> out(Size, old_val);
65-
std::vector<T> in(Size);
64+
using aligned_allocator =
65+
sycl::usm_allocator<T, sycl::usm::alloc::host,
66+
Flags::template alignment<__ESIMD_DNS::__raw_t<T>>>;
67+
aligned_allocator Allocator(q);
68+
69+
std::vector<T, aligned_allocator> out(Size, old_val, Allocator);
70+
std::vector<T, aligned_allocator> in(Size, Allocator);
6671
for (int i = 0; i < Size; i++)
6772
in[i] = get_rand<T>();
6873

@@ -83,12 +88,25 @@ bool test(uint32_t pmask = 0xffffffff) {
8388
simd<T, VS> vals;
8489
if constexpr (use_prefetch) {
8590
lsc_prefetch<T, VS, DS, L1H, L3H>(acci, byte_off);
86-
vals = lsc_block_load<T, VS, DS>(acci, byte_off);
91+
if constexpr (sizeof(T) < 8) {
92+
vals = lsc_block_load<T, VS, DS, L1H, L3H>(acci, byte_off,
93+
Flags{});
94+
} else {
95+
vals = lsc_block_load<T, VS, DS, L1H, L3H>(acci, byte_off);
96+
}
97+
} else {
98+
if constexpr (sizeof(T) < 8) {
99+
vals = lsc_block_load<T, VS, DS, L1H, L3H>(acci, byte_off,
100+
Flags{});
101+
} else {
102+
vals = lsc_block_load<T, VS, DS, L1H, L3H>(acci, byte_off);
103+
}
104+
}
105+
if constexpr (sizeof(T) < 8) {
106+
lsc_block_store<T, VS, DS>(acco, byte_off, vals, Flags{});
87107
} else {
88-
vals = lsc_block_load<T, VS, DS, L1H, L3H>(acci, byte_off);
108+
lsc_block_store<T, VS, DS>(acco, byte_off, vals);
89109
}
90-
lsc_block_store<T, VS, lsc_data_size::default_size>(
91-
acco, byte_off, vals);
92110
} else {
93111
simd<uint32_t, VL> offset(byte_off, VS * sizeof(T));
94112
simd_mask<VL> pred;

SYCL/ESIMD/lsc/Inputs/lsc_surf_store.hpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ using namespace sycl::ext::intel::experimental::esimd;
2020
template <int case_num, typename T, uint32_t Groups, uint32_t Threads,
2121
uint16_t VL, uint16_t VS, bool transpose,
2222
lsc_data_size DS = lsc_data_size::default_size,
23-
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
23+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
24+
typename Flags = __ESIMD_NS::overaligned_tag<4>>
2425
bool test(uint32_t pmask = 0xffffffff) {
2526
static_assert((VL == 1) || !transpose, "Transpose must have exec size 1");
2627
if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
@@ -60,8 +61,12 @@ bool test(uint32_t pmask = 0xffffffff) {
6061
// threads in each group
6162
sycl::range<1> LocalRange{Threads};
6263
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
64+
using aligned_allocator =
65+
sycl::usm_allocator<T, sycl::usm::alloc::host,
66+
Flags::template alignment<__ESIMD_DNS::__raw_t<T>>>;
67+
aligned_allocator Allocator(q);
6368

64-
std::vector<T> out(Size, old_val);
69+
std::vector<T, aligned_allocator> out(Size, old_val, Allocator);
6570

6671
try {
6772
buffer<T, 1> bufo(out.data(), out.size());
@@ -76,7 +81,12 @@ bool test(uint32_t pmask = 0xffffffff) {
7681

7782
if constexpr (transpose) {
7883
simd<T, VS> vals(new_val + elem_off, 1);
79-
lsc_block_store<T, VS, DS, L1H, L3H>(acco, byte_off, vals);
84+
if constexpr (sizeof(T) < 8) {
85+
lsc_block_store<T, VS, DS, L1H, L3H>(acco, byte_off, vals,
86+
Flags{});
87+
} else {
88+
lsc_block_store<T, VS, DS, L1H, L3H>(acco, byte_off, vals);
89+
}
8090
} else {
8191
simd<uint32_t, VL> offset(byte_off, VS * sizeof(T));
8292
simd_mask<VL> pred;

SYCL/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp

Lines changed: 64 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ using namespace sycl::ext::intel::experimental::esimd;
2121
template <typename T, uint16_t N,
2222
lsc_data_size DS = lsc_data_size::default_size,
2323
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
24-
bool UsePrefetch = false, bool UseOldValuesOperand = true>
24+
bool UsePrefetch = false, bool UseOldValuesOperand = true,
25+
typename Flags = __ESIMD_NS::overaligned_tag<4>>
2526
bool test(queue Q, uint32_t Groups, uint32_t Threads) {
2627
static_assert(DS != lsc_data_size::u8u32 && DS != lsc_data_size::u16u32,
2728
"unsupported DS for lsc_block_load()");
@@ -38,8 +39,10 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) {
3839
sycl::range<1> LocalRange{Threads};
3940
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
4041

41-
T *Out = sycl::malloc_shared<T>(Size, Q);
42-
T *In = sycl::malloc_shared<T>(Size, Q);
42+
T *Out = static_cast<T *>(sycl::aligned_alloc_shared(
43+
Flags::template alignment<__ESIMD_DNS::__raw_t<T>>, Size, Q));
44+
T *In = static_cast<T *>(sycl::aligned_alloc_shared(
45+
Flags::template alignment<__ESIMD_DNS::__raw_t<T>>, Size, Q));
4346
for (int i = 0; i < Size; i++) {
4447
In[i] = get_rand<T>();
4548
Out[i] = 0;
@@ -61,20 +64,43 @@ bool test(queue Q, uint32_t Groups, uint32_t Threads) {
6164
simd_mask<1> Mask = GlobalID % 1;
6265
if constexpr (UsePrefetch) {
6366
lsc_prefetch<T, N, DS, L1H, L3H>(In + ElemOffset);
64-
Vals = lsc_block_load<T, N, DS>(In + ElemOffset, Mask, OldValues);
67+
if constexpr (sizeof(T) < 8) {
68+
Vals = lsc_block_load<T, N, DS>(In + ElemOffset, Mask, OldValues,
69+
Flags{});
70+
} else {
71+
Vals = lsc_block_load<T, N, DS>(In + ElemOffset, Mask, OldValues);
72+
}
6573
} else {
66-
Vals = lsc_block_load<T, N, DS, L1H, L3H>(In + ElemOffset, Mask,
67-
OldValues);
74+
if constexpr (sizeof(T) < 8) {
75+
Vals = lsc_block_load<T, N, DS, L1H, L3H>(In + ElemOffset, Mask,
76+
OldValues, Flags{});
77+
} else {
78+
Vals = lsc_block_load<T, N, DS, L1H, L3H>(In + ElemOffset, Mask,
79+
OldValues);
80+
}
6881
}
6982
} else {
7083
if constexpr (UsePrefetch) {
7184
lsc_prefetch<T, N, DS, L1H, L3H>(In + ElemOffset);
72-
Vals = lsc_block_load<T, N, DS>(In + ElemOffset);
85+
if constexpr (sizeof(T) < 8) {
86+
Vals = lsc_block_load<T, N, DS>(In + ElemOffset, Flags{});
87+
} else {
88+
Vals = lsc_block_load<T, N, DS>(In + ElemOffset);
89+
}
7390
} else {
74-
Vals = lsc_block_load<T, N, DS, L1H, L3H>(In + ElemOffset);
91+
if constexpr (sizeof(T) < 8) {
92+
Vals =
93+
lsc_block_load<T, N, DS, L1H, L3H>(In + ElemOffset, Flags{});
94+
} else {
95+
Vals = lsc_block_load<T, N, DS, L1H, L3H>(In + ElemOffset);
96+
}
7597
}
7698
}
77-
lsc_block_store(Out + ElemOffset, Vals);
99+
if constexpr (sizeof(T) < 8) {
100+
lsc_block_store(Out + ElemOffset, Vals, Flags{});
101+
} else {
102+
lsc_block_store(Out + ElemOffset, Vals);
103+
}
78104
}).wait();
79105
} catch (sycl::exception const &e) {
80106
std::cout << "SYCL exception caught: " << e.what() << '\n';
@@ -132,7 +158,25 @@ template <typename T> bool test_lsc_block_load() {
132158
if constexpr (sizeof(T) * 2 >= sizeof(int))
133159
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 5, 5);
134160
if constexpr (sizeof(T) >= sizeof(int))
135-
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, NoCheckMerge>(Q, 3, 5);
161+
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 3, 5);
162+
if constexpr (sizeof(T) <= 4) {
163+
Passed &= test<T, 128, DS, L1H, L3H, NoPrefetch, CheckMerge,
164+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
165+
Passed &= test<T, 128, DS, L1H, L3H, NoPrefetch, NoCheckMerge,
166+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
167+
if constexpr (sizeof(T) == 2) {
168+
Passed &= test<T, 256, DS, L1H, L3H, NoPrefetch, CheckMerge,
169+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
170+
Passed &= test<T, 256, DS, L1H, L3H, NoPrefetch, NoCheckMerge,
171+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
172+
}
173+
if constexpr (sizeof(T) == 1) {
174+
Passed &= test<T, 512, DS, L1H, L3H, NoPrefetch, CheckMerge,
175+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
176+
Passed &= test<T, 512, DS, L1H, L3H, NoPrefetch, NoCheckMerge,
177+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
178+
}
179+
}
136180

137181
Passed &= test<T, 64, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 1, 4);
138182
Passed &= test<T, 32, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 2, 2);
@@ -143,6 +187,16 @@ template <typename T> bool test_lsc_block_load() {
143187
Passed &= test<T, 2, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 5, 5);
144188
if constexpr (sizeof(T) >= sizeof(int))
145189
Passed &= test<T, 1, DS, L1H, L3H, NoPrefetch, CheckMerge>(Q, 3, 5);
190+
// Only 512-bits maximum can be loaded at once (i.e. 4*128 bytes).
191+
if constexpr (sizeof(T) <= 4)
192+
Passed &= test<T, 128, DS, L1H, L3H, NoPrefetch, CheckMerge,
193+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
194+
if constexpr (sizeof(T) <= 2)
195+
Passed &= test<T, 256, DS, L1H, L3H, NoPrefetch, CheckMerge,
196+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
197+
if constexpr (sizeof(T) == 1)
198+
Passed &= test<T, 512, DS, L1H, L3H, NoPrefetch, CheckMerge,
199+
__ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
146200

147201
return Passed;
148202
}

SYCL/ESIMD/lsc/Inputs/lsc_usm_store.hpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ typedef uint32_t Toffset;
2626
template <int case_num, typename T, uint32_t Groups, uint32_t Threads,
2727
uint16_t VL, uint16_t VS, bool transpose,
2828
lsc_data_size DS = lsc_data_size::default_size,
29-
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
29+
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none,
30+
typename Flags = __ESIMD_NS::overaligned_tag<4>>
3031
bool test(uint32_t pmask = 0xffffffff) {
3132
static_assert((VL == 1) || !transpose, "Transpose must have exec size 1");
3233
if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
@@ -68,7 +69,9 @@ bool test(uint32_t pmask = 0xffffffff) {
6869
sycl::range<1> LocalRange{Threads};
6970
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
7071

71-
T *out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), dev, ctx));
72+
T *out = static_cast<T *>(sycl::aligned_alloc_shared(
73+
Flags::template alignment<__ESIMD_DNS::__raw_t<T>>, Size * sizeof(T), dev,
74+
ctx));
7275
for (int i = 0; i < Size; i++)
7376
out[i] = old_val;
7477

@@ -82,7 +85,12 @@ bool test(uint32_t pmask = 0xffffffff) {
8285

8386
if constexpr (transpose) {
8487
simd<T, VS> vals(new_val + elem_off, 1);
85-
lsc_block_store<T, VS, DS, L1H, L3H>(out + elem_off, vals);
88+
if constexpr (sizeof(T) < 8) {
89+
lsc_block_store<T, VS, DS, L1H, L3H>(out + elem_off, vals,
90+
Flags{});
91+
} else {
92+
lsc_block_store<T, VS, DS, L1H, L3H>(out + elem_off, vals);
93+
}
8694
} else {
8795
simd<Toffset, VL> offset(byte_off, VS * sizeof(T));
8896
simd_mask<VL> pred;

SYCL/ESIMD/lsc/lsc_surf_load_u32.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ template <int TestCastNum, typename T> bool tests() {
3232
passed &= test<TestCastNum + 8, T, 1, 4, 1, 32, true>();
3333
passed &= test<TestCastNum + 9, T, 2, 2, 1, 16, true>();
3434
passed &= test<TestCastNum + 10, T, 4, 4, 1, 4, true>();
35+
36+
// test large number of elements
37+
passed &= test<TestCastNum + 11, T, 2, 1, 1, 128, true,
38+
lsc_data_size::default_size, cache_hint::none,
39+
cache_hint::none, false, __ESIMD_NS::overaligned_tag<8>>(1);
40+
3541
return passed;
3642
}
3743

@@ -40,8 +46,8 @@ int main(void) {
4046
bool passed = true;
4147

4248
passed &= tests<0, uint32_t>();
43-
passed &= tests<11, float>();
44-
passed &= tests<22, sycl::ext::intel::experimental::esimd::tfloat32>();
49+
passed &= tests<12, float>();
50+
passed &= tests<24, sycl::ext::intel::experimental::esimd::tfloat32>();
4551

4652
std::cout << (passed ? "Passed\n" : "FAILED\n");
4753
return passed ? 0 : 1;

SYCL/ESIMD/lsc/lsc_surf_load_u8_u16.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ template <int TestCastNum, typename T> bool tests() {
1919
passed &= test<TestCastNum, T, 1, 4, 1, 32, true>();
2020
passed &= test<TestCastNum + 1, T, 2, 2, 1, 16, true>();
2121
passed &= test<TestCastNum + 2, T, 4, 4, 1, 4, true>();
22+
passed &= test<TestCastNum + 3, T, 4, 4, 1, 128, true,
23+
lsc_data_size::default_size, cache_hint::none,
24+
cache_hint::none, false, __ESIMD_NS::overaligned_tag<8>>();
25+
passed &= test<TestCastNum + 4, T, 4, 4, 1, 256, true,
26+
lsc_data_size::default_size, cache_hint::none,
27+
cache_hint::none, false, __ESIMD_NS::overaligned_tag<8>>();
2228

2329
return passed;
2430
}
@@ -28,9 +34,9 @@ int main(void) {
2834
bool passed = true;
2935

3036
passed &= tests<0, uint8_t>();
31-
passed &= tests<3, uint16_t>();
32-
passed &= tests<6, sycl::ext::oneapi::bfloat16>();
33-
passed &= tests<9, half>();
37+
passed &= tests<5, uint16_t>();
38+
passed &= tests<10, sycl::ext::oneapi::bfloat16>();
39+
passed &= tests<15, half>();
3440

3541
std::cout << (passed ? "Passed\n" : "FAILED\n");
3642
return passed ? 0 : 1;

SYCL/ESIMD/lsc/lsc_surf_store_u32.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ template <int TestCastNum, typename T> bool tests() {
3030
passed &= test<TestCastNum + 8, T, 1, 4, 1, 32, true>();
3131
passed &= test<TestCastNum + 9, T, 2, 2, 1, 16, true>();
3232
passed &= test<TestCastNum + 10, T, 4, 4, 1, 4, true>();
33+
// large number of elements
34+
passed &=
35+
test<TestCastNum + 11, T, 2, 1, 1, 128, true, lsc_data_size::default_size,
36+
cache_hint::none, cache_hint::none, __ESIMD_NS::overaligned_tag<8>>(
37+
1);
3338
return passed;
3439
}
3540

@@ -38,7 +43,7 @@ int main(void) {
3843
bool passed = true;
3944

4045
passed &= tests<0, uint32_t>();
41-
passed &= tests<11, float>();
46+
passed &= tests<12, float>();
4247

4348
std::cout << (passed ? "Passed\n" : "FAILED\n");
4449
return passed ? 0 : 1;

SYCL/ESIMD/lsc/lsc_surf_store_u8_u16.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ template <int TestCastNum, typename T> bool tests() {
1919
passed &= test<TestCastNum, T, 1, 4, 1, 32, true>();
2020
passed &= test<TestCastNum + 1, T, 2, 2, 1, 16, true>();
2121
passed &= test<TestCastNum + 2, T, 4, 4, 1, 4, true>();
22+
passed &= test<TestCastNum + 3, T, 4, 4, 1, 128, true,
23+
lsc_data_size::default_size, cache_hint::none,
24+
cache_hint::none, __ESIMD_NS::overaligned_tag<8>>();
25+
passed &= test<TestCastNum + 4, T, 4, 4, 1, 256, true,
26+
lsc_data_size::default_size, cache_hint::none,
27+
cache_hint::none, __ESIMD_NS::overaligned_tag<8>>();
2228

2329
return passed;
2430
}
@@ -28,7 +34,7 @@ int main(void) {
2834
bool passed = true;
2935

3036
passed &= tests<0, uint8_t>();
31-
passed &= tests<3, uint16_t>();
37+
passed &= tests<5, uint16_t>();
3238

3339
std::cout << (passed ? "Passed\n" : "FAILED\n");
3440
return passed ? 0 : 1;

SYCL/ESIMD/lsc/lsc_usm_store_u32.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
#include "Inputs/lsc_usm_store.hpp"
1414

15-
constexpr uint32_t seed = 288;
15+
constexpr uint32_t seed = 299;
1616
template <int TestCastNum, typename T> bool tests() {
1717
bool passed = true;
1818
// non transpose
@@ -31,6 +31,11 @@ template <int TestCastNum, typename T> bool tests() {
3131
passed &= test<TestCastNum + 9, T, 2, 2, 1, 16, true>();
3232
passed &= test<TestCastNum + 10, T, 4, 4, 1, 4, true>();
3333

34+
// large number of elements
35+
passed &= test<TestCastNum + 11, T, 4, 4, 1, 128, true,
36+
lsc_data_size::default_size, cache_hint::none,
37+
cache_hint::none, __ESIMD_NS::overaligned_tag<8>>();
38+
3439
return passed;
3540
}
3641

@@ -39,7 +44,7 @@ int main(void) {
3944
bool passed = true;
4045

4146
passed &= tests<0, uint32_t>();
42-
passed &= tests<11, float>();
47+
passed &= tests<12, float>();
4348

4449
std::cout << (passed ? "Passed\n" : "FAILED\n");
4550
return passed ? 0 : 1;

0 commit comments

Comments
 (0)