Skip to content

Commit bb811e5

Browse files
authored
Auto pulldown and update tc files for xmain-cand branch
Auto pulldown and update tc files for xmain-cand branch on 20220604
2 parents 8e3bd29 + c4bd9f1 commit bb811e5

15 files changed

+543
-12
lines changed
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#include <CL/sycl.hpp>
2+
#include <sycl/ext/intel/esimd.hpp>
3+
4+
#include <iostream>
5+
6+
using namespace cl::sycl;
7+
using namespace sycl::ext::intel::esimd;
8+
using namespace sycl::ext::intel::experimental::esimd;
9+
10+
// TODO: The SPEC does not say what values are returned for lsc_slm_gather
11+
// when the corresponding elements of the predicate/mask is zero.
12+
// It is assumed to be undefined values there.
13+
// Thus this test does not check those elements now. From the API point of view
14+
// it may be better to have another argument for the values being copied to
15+
// the result when the mask bit is 0.
16+
17+
// Verifies the LSC SLM *load* intrinsics: lsc_slm_block_load when Transpose
// is true, lsc_slm_gather otherwise.
//
// Template parameters:
//   CaseNum    - test id, used only for diagnostic output.
//   T          - element type; sizeof(T) must be >= 4 (enforced below).
//   Groups     - number of work-groups launched.
//   LocalRange - number of work-items per work-group.
//   VL         - vector length: elements per block load / offsets per gather.
//   NChannels  - elements loaded per offset (gather path only).
//   Transpose  - selects the block-load path (true) vs the gather path (false).
//   DS         - LSC data size / conversion kind.
//
// PMask builds the per-lane predicate for the gather; output elements whose
// predicate bit is 0 are skipped during verification because their loaded
// values are undefined (see the TODO above).
//
// Returns true iff every checked element of the shared output buffer matches
// its expected value.
template <int CaseNum, typename T, uint32_t Groups, uint32_t LocalRange,
          uint16_t VL, uint16_t NChannels, bool Transpose,
          lsc_data_size DS = lsc_data_size::default_size>
bool test(uint32_t PMask = ~0) {
  // Compile-time legality checks for the requested LSC configuration.
  static_assert((NChannels == 1) || !Transpose,
                "Transpose must have exec size 1");
  if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
    static_assert(!Transpose, "Conversion types may not use vector");
    static_assert(NChannels == 1, "Only D32 and D64 support vector load");
  }

  static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW");
  static_assert(sizeof(T) >= 4,
                "D8 and D16 are valid only in 2D block load/store");

  if constexpr (!Transpose && NChannels > 1) {
    static_assert(VL == 16 || VL == 32,
                  "IGC prohibits execution size less than SIMD size when "
                  "vector size is greater than 1");
  }

  // Mask applied to the expected values so comparisons model the truncation
  // performed by the conversion data sizes (u8u32/u16u32/u16u32h).
  T VMask = static_cast<T>(-1);
  if constexpr (DS == lsc_data_size::u8u32)
    VMask = static_cast<T>(0xff);
  else if constexpr (DS == lsc_data_size::u16u32)
    VMask = static_cast<T>(0xffff);
  else if constexpr (DS == lsc_data_size::u16u32h)
    VMask = static_cast<T>(0xffff0000);

  queue Q(gpu_selector{});
  auto D = Q.get_device();
  std::cout << "Running case #" << CaseNum << " on "
            << D.get_info<info::device::name>() << std::endl;

  nd_range<1> Range{range<1>{Groups * LocalRange}, range<1>{LocalRange}};
  // Shared (USM) buffer receiving VL * NChannels results per work-item.
  constexpr uint16_t OutSize = Groups * LocalRange * VL * NChannels;
  T *Out = malloc_shared<T>(OutSize, Q);
  memset(Out, 0, OutSize * sizeof(T));

  try {
    Q.submit([&](handler &cgh) {
      cgh.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL {
        uint32_t GID = NDId.get_global_id(0);
        uint32_t LID = NDId.get_local_id(0);
        uint32_t GroupID = NDId.get_group_linear_id();

        // Allocate and init 128-byte multiple size SLM memory with
        // consecutive values. i-th group gets values:
        // {0, 1, 2, ...} + GroupID * 1000000.
        constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T);
        constexpr uint32_t SLMSize =
            (ResultSIMDByteSize * LocalRange + 127) & ~127;
        slm_init(SLMSize);
        // Only the group's first work-item fills SLM; everyone else waits at
        // the barrier below before loading.
        if (NDId.get_local_id(0) == 0) {
          simd<T, 4> Vals(GroupID * 1000000, 1);
          for (int I = 0; I < SLMSize; I += 4 * sizeof(T)) {
            slm_block_store<T, 4>(I, Vals);
            Vals += 4;
          }
        }
        barrier();

        if constexpr (Transpose) {
          // Block-load path: one contiguous VL-element load per work-item.
          auto Vals = lsc_slm_block_load<T, VL, DS>(LID * VL * sizeof(T));
          Vals.copy_to(Out + GID * VL);
        } else {
          // Gather path: VL offsets, NChannels elements per offset.
          simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T),
                                     NChannels * sizeof(T));

          // Create the predicate for the gather from 'PMask'.
          simd_mask<VL> Pred;
          for (int I = 0; I < VL; I++)
            Pred.template select<1, 1>(I) = (PMask >> I) & 1;

          simd<T, VL *NChannels> Vals =
              lsc_slm_gather<T, NChannels, DS>(Offsets, Pred);

          Vals.copy_to(Out + GID * VL * NChannels);
        }
      });
    }).wait();
  } catch (sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    sycl::free(Out, Q);
    return false;
  }

  bool Passed = true;

  // Host-side verification of the kernel output.
  if constexpr (Transpose) {
    for (uint32_t I = 0; I < OutSize; I++) {
      uint32_t GroupId = I / (LocalRange * VL * NChannels);
      uint32_t LID = I % (LocalRange * VL * NChannels);
      T ExpectedVal = GroupId * 1000000 + LID;
      if (Out[I] != ExpectedVal) {
        Passed = false;
        std::cout << I << ": Value = " << Out[I]
                  << ", Expected value = " << ExpectedVal << std::endl;
      }
    }
  } else {
    for (uint32_t I = 0; I < OutSize; I += VL * NChannels) {
      uint32_t GroupId = I / (LocalRange * VL * NChannels);
      uint32_t LID = I % (LocalRange * VL * NChannels);
      T ExpectedValBase = GroupId * 1000000 + LID;
      for (int ChannelId = 0; ChannelId < NChannels; ChannelId++) {
        for (int J = 0; J < VL; J++) {
          uint32_t OutIndex = I + ChannelId * VL + J;

          // Lanes with a zero predicate bit hold undefined values; skip them
          // (see the TODO at the top of this file).
          if (((PMask >> J) & 1) == 0)
            continue;
          T ExpectedVal = (ExpectedValBase + ChannelId + J * NChannels) & VMask;
          if (Out[OutIndex] != ExpectedVal) {
            Passed = false;
            std::cout << OutIndex << ": Value = " << Out[OutIndex]
                      << ", Expected value = " << ExpectedVal << std::endl;
          }
        }
      }
    }
  }

  sycl::free(Out, Q);

  if (!Passed)
    std::cout << "Case #" << CaseNum << " FAILED" << std::endl;
  return Passed;
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
#include <CL/sycl.hpp>
2+
#include <sycl/ext/intel/esimd.hpp>
3+
4+
#include <iostream>
5+
6+
using namespace cl::sycl;
7+
using namespace sycl::ext::intel::esimd;
8+
using namespace sycl::ext::intel::experimental::esimd;
9+
10+
// Verifies the LSC SLM *store* intrinsics: lsc_slm_block_store when Transpose
// is true, lsc_slm_scatter otherwise. SLM is pre-filled with a poison pattern
// (0xBAADF00D...), the store under test overwrites it, and the SLM content is
// then copied out to USM and checked on the host.
//
// Template parameters:
//   CaseNum    - test id, used only for diagnostic output.
//   T          - element type; sizeof(T) must be >= 4 (enforced below).
//   Groups     - number of work-groups launched.
//   LocalRange - number of work-items per work-group.
//   VL         - vector length: elements per block store / offsets per scatter.
//   NChannels  - elements stored per offset (scatter path only).
//   Transpose  - selects block store (true) vs scatter (false).
//   DS         - LSC data size / conversion kind.
//
// PMask builds the per-lane predicate for the scatter; lanes with a zero bit
// must leave the poison pattern untouched, which the verification checks.
//
// Returns true iff every element of the copied-out SLM matches its expected
// value.
template <int CaseNum, typename T, uint32_t Groups, uint32_t LocalRange,
          uint16_t VL, uint16_t NChannels, bool Transpose,
          lsc_data_size DS = lsc_data_size::default_size>
bool test(uint32_t PMask = ~0) {
  // Compile-time legality checks for the requested LSC configuration.
  static_assert((NChannels == 1) || !Transpose,
                "Transpose must have exec size 1");
  if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) {
    static_assert(!Transpose, "Conversion types may not use vector");
    static_assert(NChannels == 1, "Only D32 and D64 support vector load");
  }

  static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW");
  static_assert(sizeof(T) >= 4,
                "D8 and D16 are valid only in 2D block load/store");

  if constexpr (!Transpose && NChannels > 1) {
    static_assert(VL == 16 || VL == 32,
                  "IGC prohibits execution size less than SIMD size when "
                  "vector size is greater than 1");
  }

  // Mask applied to both expected and actual values so comparisons model the
  // truncation performed by the conversion data sizes.
  T VMask = static_cast<T>(-1);
  if constexpr (DS == lsc_data_size::u8u32)
    VMask = static_cast<T>(0xff);
  else if constexpr (DS == lsc_data_size::u16u32)
    VMask = static_cast<T>(0xffff);
  else if constexpr (DS == lsc_data_size::u16u32h)
    VMask = static_cast<T>(0xffff0000);

  queue Q(gpu_selector{});
  auto D = Q.get_device();
  std::cout << "Running case #" << CaseNum << " on "
            << D.get_info<info::device::name>() << std::endl;

  nd_range<1> Range{range<1>{Groups * LocalRange}, range<1>{LocalRange}};
  // Shared (USM) buffer receiving the group's full SLM content for checking.
  constexpr uint16_t OutSize = Groups * LocalRange * VL * NChannels;
  T *Out = malloc_shared<T>(OutSize, Q);
  memset(Out, 0, OutSize * sizeof(T));

  try {
    Q.submit([&](handler &cgh) {
      cgh.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL {
        uint32_t GID = NDId.get_global_id(0);
        uint32_t LID = NDId.get_local_id(0);
        uint32_t GroupID = NDId.get_group_linear_id();

        // 1. Allocate and init 128-byte multiple size SLM memory with special
        // values.
        constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T);
        constexpr uint32_t SLMSize =
            (ResultSIMDByteSize * LocalRange + 127) & ~127;
        slm_init(SLMSize);
        // Poison-fill SLM from one work-item so unwritten (predicated-off)
        // lanes are detectable during verification.
        if (NDId.get_local_id(0) == 0) {
          simd<T, 4> Vals = static_cast<T>(0xBAADF00DBAADF00D);
          for (int I = 0; I < SLMSize; I += 4 * sizeof(T))
            slm_block_store<T, 4>(I, Vals);
        }
        barrier();

        // 2. Use STORE intrinsics that are being verified in this test.
        if constexpr (Transpose) {
          simd<T, VL> Vals(GroupID * 1000000 + LID * 1000, 1);
          lsc_slm_block_store<T, VL, DS>(LID * VL * sizeof(T), Vals);
        } else {

          // Create the predicate for the scatter from 'PMask'.
          simd_mask<VL> Pred;
          for (int I = 0; I < VL; I++)
            Pred.template select<1, 1>(I) = (PMask >> I) & 1;

          simd<T, VL * NChannels> Vals(GroupID * 1000000 + LID * 1000, 1);
          simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T),
                                     NChannels * sizeof(T));
          lsc_slm_scatter<T, NChannels, DS>(Offsets, Vals, Pred);
        }
        barrier();

        // 3. Simply load the content of SLM and store it to USM.
        if (NDId.get_local_id(0) == 0) {
          int End = LocalRange * VL * NChannels;
          for (int I = 0; I < End; I += 4) {
            auto Vals = slm_block_load<T, 4>(I * sizeof(T));

            // If 'VL' is small, simd<T, 4> cannot be safely used
            if (I + 4 > End) {
              // Tail: copy element-wise to avoid reading past 'End'.
              for (int J = 0; J + I < End; J++)
                Out[GroupID * LocalRange * VL * NChannels + I + J] =
                    (T)Vals[J];
            } else {
              Vals.copy_to(Out + GroupID * LocalRange * VL * NChannels + I);
            }
          }
        }
      });
    }).wait();
  } catch (sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    sycl::free(Out, Q);
    return false;
  }

  bool Passed = true;

  // Host-side verification of the SLM content copied out to USM.
  if constexpr (Transpose) {
    for (uint32_t I = 0; I < OutSize; I++) {
      uint32_t GroupId = I / (LocalRange * VL);
      uint32_t LID = I / VL % LocalRange;
      T ExpectedVal = GroupId * 1000000 + LID * 1000 + I % VL;
      if (Out[I] != ExpectedVal) {
        Passed = false;
        std::cout << I << ": Value = " << Out[I]
                  << ", Expected value = " << ExpectedVal << std::endl;
      }
    }
  } else {
    for (uint32_t I = 0; I < OutSize; I += VL * NChannels) {
      uint32_t GroupId = I / (LocalRange * VL * NChannels);
      uint32_t LID = I / (VL * NChannels) % LocalRange;
      T ExpectedValBase = GroupId * 1000000 + LID * 1000 + I % (VL * NChannels);
      T ExpectedValInc = 0;
      // MaskIndex walks PMask one bit per offset; MaskIndexTimer counts the
      // NChannels consecutive elements written under the same predicate bit.
      uint32_t MaskIndex = 0;
      uint32_t MaskIndexTimer = 0;
      for (int ChannelId = 0; ChannelId < NChannels; ChannelId++) {
        for (int J = 0; J < VL; J++) {
          uint32_t OutIndex = I + ChannelId * VL + J;
          // Predicated-off lanes must still hold the poison pattern.
          T ExpectedVal = ((PMask >> MaskIndex) & 1)
                              ? (ExpectedValBase + ExpectedValInc)
                              : static_cast<T>(0xBAADF00DBAADF00D);
          ExpectedVal &= VMask;
          MaskIndexTimer++;
          if (MaskIndexTimer >= NChannels) {
            MaskIndexTimer = 0;
            MaskIndex++;
          }

          ExpectedValInc += VL;
          if (ExpectedValInc >= VL * NChannels)
            ExpectedValInc = (ExpectedValInc % (VL * NChannels)) + 1;

          T OutVal = Out[OutIndex] & VMask;
          if (OutVal != ExpectedVal) {
            Passed = false;
            std::cout << OutIndex << ": Value = " << Out[OutIndex]
                      << ", Expected value = " << ExpectedVal << std::endl;
          }
        }
      }
    }
  }

  sycl::free(Out, Q);

  if (!Passed)
    std::cout << "Case #" << CaseNum << " FAILED" << std::endl;
  return Passed;
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
//==------------ lsc_block_store_u64.cpp - DPC++ ESIMD on-device test ------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu-intel-pvc
9+
// UNSUPPORTED: cuda || hip
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
13+
#include "Inputs/lsc_block_store.hpp"
14+
15+
constexpr uint32_t seed = 363;
16+
using T = uint64_t;
17+
18+
int main(void) {
19+
srand(seed);
20+
bool passed = true;
21+
22+
passed &= test<1, T, 1, 1, 8, 8>(11, 20, 14, 3, 11);
23+
passed &= test<2, T, 2, 2, 2, 2>(3, 3, 8, 1, 1);
24+
25+
std::cout << (passed ? "Passed\n" : "FAILED\n");
26+
return passed ? 0 : 1;
27+
}

SYCL/ESIMD/lsc/lsc_slm_block_load.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// REQUIRES: gpu-intel-pvc
2+
// UNSUPPORTED: cuda || hip
3+
// RUN: %clangxx -fsycl %s -o %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
6+
// This test verifies the correctness of LSC intrinsics loading
7+
// from SLM memory.
8+
9+
#include "Inputs/lsc_slm_load.hpp"
10+
11+
// This test verifies the correctness of LSC SLM block load intrinsics.
12+
13+
// Id - test id.
14+
// NGroups - number of work groups.
15+
// LocalSize - number work items in each work group.
16+
// VL - number of offsets used in the gather operation.
17+
template <int Id, int NGroups, int LocalSize, int VL> bool test_load() {
18+
bool Passed = true;
19+
Passed &= test<Id, uint32_t, NGroups, LocalSize, VL, 1, true>();
20+
Passed &= test<Id + 1, uint64_t, NGroups, LocalSize, VL, 1, true>();
21+
return Passed;
22+
}
23+
24+
int main() {
  // Run every configuration and fold the results together so that all cases
  // are reported even after a failure.
  // Argument order: test_load<Id, NGroups, LocalSize, VL>().
  bool AllPassed = true;
  AllPassed &= test_load<0, 1, 1, 4>();
  AllPassed &= test_load<2, 1, 7, 16>();
  AllPassed &= test_load<4, 4, 7, 16>();
  AllPassed &= test_load<6, 16, 8, 8>();
  AllPassed &= test_load<8, 2, 4, 32>();
  AllPassed &= test_load<10, 2, 4, 64>();

  std::cout << (AllPassed ? "Passed" : "FAILED") << std::endl;
  return AllPassed ? 0 : 1;
}

0 commit comments

Comments
 (0)