Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 1d1174e

Browse files
authored
[SYCL][ESIMD] Fix kernel bug in BitonicSortKv2 and Kmeans (#75)
Fix kernel bug in BitonicSortKv2 and Kmeans, add split barrier and dp4a tests.
1 parent 991d47b commit 1d1174e

File tree

4 files changed

+270
-3
lines changed

4 files changed

+270
-3
lines changed

SYCL/ESIMD/BitonicSortKv2.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -599,8 +599,11 @@ int main(int argc, char *argv[]) {
599599
int size = 1 << LOG2_ELEMENTS;
600600
cout << "BitonicSort (" << size << ") Start..." << std::endl;
601601

602+
cl::sycl::property_list props{property::queue::enable_profiling{},
603+
property::queue::in_order()};
604+
602605
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
603-
property::queue::enable_profiling{});
606+
props);
604607

605608
BitonicSort bitonicSort;
606609

SYCL/ESIMD/dp4a.cpp

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
//==------------------ dp4a.cpp - DPC++ ESIMD on-device test --------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// TODO enable on Windows
9+
// REQUIRES: linux && gpu
10+
// RUN: %clangxx-esimd -fsycl %s -o %t.out
11+
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
12+
// TODO : Enable test for new GPU device
13+
// XFAIL: *
14+
15+
#include "esimd_test_utils.hpp"
16+
17+
#include <CL/sycl.hpp>
18+
#include <CL/sycl/INTEL/esimd.hpp>
19+
#include <iostream>
20+
21+
using namespace cl::sycl;
22+
23+
int main(void) {
24+
constexpr unsigned SIZE = 16;
25+
constexpr unsigned GROUPSIZE = 1;
26+
using DTYPE = unsigned int;
27+
28+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
29+
30+
auto dev = q.get_device();
31+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
32+
auto ctxt = q.get_context();
33+
34+
DTYPE *S0 =
35+
static_cast<DTYPE *>(malloc_shared(SIZE * sizeof(DTYPE), dev, ctxt));
36+
DTYPE *S1 =
37+
static_cast<DTYPE *>(malloc_shared(SIZE * sizeof(DTYPE), dev, ctxt));
38+
DTYPE *S2 =
39+
static_cast<DTYPE *>(malloc_shared(SIZE * sizeof(DTYPE), dev, ctxt));
40+
41+
DTYPE *RES =
42+
static_cast<DTYPE *>(malloc_shared(SIZE * sizeof(DTYPE), dev, ctxt));
43+
44+
for (unsigned i = 0; i < SIZE; ++i) {
45+
S0[i] = 0x32;
46+
S1[i] = 0x0102037F;
47+
S2[i] = 0x0102037F;
48+
RES[i] = 0;
49+
}
50+
51+
cl::sycl::range<1> GroupRange{1};
52+
53+
cl::sycl::range<1> TaskRange{GROUPSIZE};
54+
cl::sycl::nd_range<1> Range(GroupRange, TaskRange);
55+
56+
try {
57+
auto e = q.submit([&](handler &cgh) {
58+
cgh.parallel_for<class Test>(
59+
Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
60+
using namespace sycl::INTEL::gpu;
61+
62+
simd<DTYPE, SIZE> src0(0);
63+
src0 = block_load<DTYPE, SIZE>(S0);
64+
65+
simd<DTYPE, SIZE> src1(0);
66+
src1 = block_load<DTYPE, SIZE>(S1);
67+
68+
simd<DTYPE, SIZE> src2(0);
69+
src2 = block_load<DTYPE, SIZE>(S2);
70+
71+
auto res =
72+
esimd_dp4a<DTYPE, DTYPE, DTYPE, DTYPE, SIZE>(src0, src1, src2);
73+
block_store<DTYPE, SIZE>(RES, res);
74+
});
75+
});
76+
e.wait();
77+
} catch (cl::sycl::exception const &e) {
78+
std::cout << "SYCL exception caught: " << e.what() << '\n';
79+
return e.get_cl_code();
80+
}
81+
82+
int err_cnt = 0;
83+
for (unsigned i = 0; i < SIZE; ++i) {
84+
if (RES[i] != 0x3F41) {
85+
if (++err_cnt < 10) {
86+
std::cout << "failed at index " << i << ", " << RES[i]
87+
<< " != " << 0x3F41 << "\n";
88+
}
89+
}
90+
}
91+
92+
std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
93+
return err_cnt > 0 ? 1 : 0;
94+
}

SYCL/ESIMD/kmeans/kmeans.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -548,8 +548,12 @@ int main(int argc, char *argv[]) {
548548
std::cerr << "Usage: kmeans.exe input_file" << std::endl;
549549
exit(1);
550550
}
551+
552+
cl::sycl::property_list props{property::queue::enable_profiling{},
553+
property::queue::in_order()};
551554
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
552-
property::queue::enable_profiling{});
555+
props);
556+
553557
auto dev = q.get_device();
554558
auto ctxt = q.get_context();
555559

@@ -662,7 +666,7 @@ int main(int argc, char *argv[]) {
662666
cmk_accum_reduction((uint *)accum, i);
663667
});
664668
});
665-
e.wait();
669+
e1.wait();
666670
kernel2_time_in_ns += report_time("kernel2", e1);
667671
#endif
668672

SYCL/ESIMD/slm_split_barrier.cpp

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
//==----------- slm_split_barrier.cpp - DPC++ ESIMD on-device test --------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// TODO enable on Windows
9+
// REQUIRES: linux && gpu
10+
// RUN: %clangxx-esimd -fsycl %s -o %t.out
11+
// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
12+
13+
#include "esimd_test_utils.hpp"
14+
15+
#include <CL/sycl.hpp>
16+
#include <CL/sycl/INTEL/esimd.hpp>
17+
#include <iostream>
18+
19+
using namespace cl::sycl;
20+
using namespace sycl::INTEL::gpu;
21+
22+
#define LOCAL_SIZE 4
23+
#define GLOBAL_SIZE 6
24+
#define NUM_THREADS LOCAL_SIZE *GLOBAL_SIZE
25+
26+
/// \brief transfer data from memory to SLM.
27+
///
28+
/// Load ::size bytes from memory pointer ::addr starting at ::offset to the
29+
/// SLM ::slmOffset. ::size must be a multiple of 256.
30+
///
31+
ESIMD_INLINE
32+
void load_to_slm(uint grpSize, uint localId, uint slmOffset, char *addr,
33+
uint offset, uint size) {
34+
simd<uint, 16> vOffset(0, 16);
35+
36+
uint numTotalBlocks = size / 256;
37+
uint numBlocks = numTotalBlocks / grpSize;
38+
uint numLeftOver = numTotalBlocks % grpSize;
39+
numBlocks += (localId < numLeftOver) ? 1 : 0;
40+
41+
uint threadOffsetInSLM = slmOffset + localId * 256;
42+
// in bytes
43+
uint threadOffsetInMemory = offset + threadOffsetInSLM;
44+
// in unit of bytes
45+
simd<uint, 16> vOffsets = vOffset + threadOffsetInSLM;
46+
47+
for (uint block = 0; block < numBlocks; block++) {
48+
simd<uint, 32> row0; // 32 floats or 128 Bytes or 4 GRF-registers
49+
simd<uint, 32> row1;
50+
simd<uint, 64> rowTrans;
51+
row0 = block_load<uint, 32>((const uint *)(addr + threadOffsetInMemory));
52+
row1 =
53+
block_load<uint, 32>((const uint *)(addr + threadOffsetInMemory + 128));
54+
55+
// Transpose
56+
rowTrans.select<8, 1>(0) = row0.select<8, 4>(0);
57+
rowTrans.select<8, 1>(16) = row0.select<8, 4>(1);
58+
rowTrans.select<8, 1>(32) = row0.select<8, 4>(2);
59+
rowTrans.select<8, 1>(48) = row0.select<8, 4>(3);
60+
61+
rowTrans.select<8, 1>(8) = row1.select<8, 4>(0);
62+
rowTrans.select<8, 1>(24) = row1.select<8, 4>(1);
63+
rowTrans.select<8, 1>(40) = row1.select<8, 4>(2);
64+
rowTrans.select<8, 1>(56) = row1.select<8, 4>(3);
65+
66+
slm_store4<uint, 16, ESIMD_ABGR_ENABLE>(rowTrans, vOffsets);
67+
threadOffsetInMemory += grpSize * 256;
68+
vOffsets += (grpSize * 256);
69+
}
70+
71+
esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE);
72+
esimd_sbarrier(ESIMD_SBARRIER_SIGNAL);
73+
esimd_sbarrier(ESIMD_SBARRIER_WAIT);
74+
}
75+
76+
int main(void) {
77+
constexpr unsigned VL = 16;
78+
constexpr unsigned Size = NUM_THREADS * VL;
79+
80+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
81+
82+
auto dev = q.get_device();
83+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
84+
auto ctxt = q.get_context();
85+
// TODO: release memory in the end of the test
86+
uint *A = static_cast<uint *>(malloc_shared(Size * sizeof(uint), dev, ctxt));
87+
uint *B = static_cast<uint *>(malloc_shared(Size * sizeof(uint), dev, ctxt));
88+
89+
// Checking with specific inputs
90+
for (int i = 0; i < NUM_THREADS; i++) {
91+
uint *A_int = (uint *)(A + i * VL);
92+
for (int j = 0; j < VL; j++) {
93+
A_int[j] = i + j;
94+
std::cout << A_int[j] << " ";
95+
}
96+
std::cout << std::endl;
97+
}
98+
99+
// We need that many workitems
100+
cl::sycl::range<1> GlobalRange{GLOBAL_SIZE};
101+
102+
// Number of workitems in a workgroup
103+
cl::sycl::range<1> LocalRange{LOCAL_SIZE};
104+
cl::sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
105+
106+
try {
107+
auto e = q.submit([&](handler &cgh) {
108+
cgh.parallel_for<class Test>(
109+
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
110+
simd<uint, VL> v_slmData;
111+
simd<uint, VL> v_Off(0, 4);
112+
113+
uint localID = ndi.get_local_id(0);
114+
uint groupSize = ndi.get_local_range(0);
115+
uint globalID = ndi.get_global_id(0);
116+
uint groupID = ndi.get_group(0);
117+
118+
slm_init(1024);
119+
120+
int grpMemOffset = groupID * groupSize * VL * 4;
121+
122+
load_to_slm(groupSize, localID, 0, (char *)A, grpMemOffset,
123+
groupSize * VL * 4);
124+
125+
auto shiftID = (localID + 1) % 4;
126+
127+
v_Off = v_Off + shiftID * 64;
128+
129+
v_slmData = slm_load<uint, VL>(v_Off);
130+
131+
block_store<uint, VL>(B + globalID * VL, v_slmData);
132+
});
133+
});
134+
e.wait();
135+
} catch (cl::sycl::exception const &e) {
136+
std::cout << "SYCL exception caught: " << e.what() << '\n';
137+
return e.get_cl_code();
138+
}
139+
140+
std::cout << "result" << std::endl;
141+
int result = 0;
142+
for (int i = 0; i < NUM_THREADS; i++) {
143+
unsigned int *p = (unsigned int *)(B + i * VL);
144+
if ((i % 4) != 3) {
145+
for (int j = 0; j < VL; j++) {
146+
std::cout << (*p) << " ";
147+
if (*p != (i + 1 + j)) {
148+
result = -1;
149+
}
150+
p++;
151+
}
152+
} else {
153+
for (int j = 0; j < VL; j++) {
154+
std::cout << (*p) << " ";
155+
if (*p != (i - 3 + j)) {
156+
result = -1;
157+
}
158+
p++;
159+
}
160+
}
161+
std::cout << std::endl;
162+
}
163+
164+
std::cout << (result < 0 ? "FAILED\n" : "Passed\n");
165+
return result < 0 ? 1 : 0;
166+
}

0 commit comments

Comments
 (0)