Skip to content

Commit 004dc3e

Browse files
v-klochkovbb-sycl
authored andcommitted
[ESIMD] Add more tests for new xmx::dpas() (intel#1291)
* Add the test for bfloat16; * Add the tests for dg2; * Run DG2 and PVC tests on esimd_emulator; Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 702bade commit 004dc3e

File tree

4 files changed

+123
-32
lines changed

4 files changed

+123
-32
lines changed

SYCL/ESIMD/dpas/dpas_bf16.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
//==---------------- dpas_bf16.cpp - DPC++ ESIMD on-device test ----------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 || esimd_emulator
9+
// UNSUPPORTED: cuda || hip
10+
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
13+
// This test verifies DPAS support for bfloat16.
14+
15+
#include "dpas_common.hpp"
16+
17+
int main(int argc, const char *argv[]) {
18+
queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
19+
20+
bool Print = argc > 1 && std::string(argv[1]) == "-debug";
21+
bool Passed = true;
22+
23+
constexpr bool LetDeduceArgs = true;
24+
Passed &= tests<8, 8, bf16, bf16, LetDeduceArgs>(Q, Print);
25+
Passed &= tests<8, 4, bf16, bf16, LetDeduceArgs>(Q, Print);
26+
Passed &= tests<8, 1, bf16, bf16, LetDeduceArgs>(Q, Print);
27+
28+
// TODO: Enable these cases when esimd::simd(ptr) constructor is fixed.
29+
// Passed &= tests<8, 5, bf16, bf16, LetDeduceArgs>(Q, Print);
30+
// Passed &= tests<8, 3, bf16, bf16, LetDeduceArgs>(Q, Print);
31+
32+
std::cout << (Passed ? "Test Passed\n" : "Test FAILED\n");
33+
return Passed ? 0 : 1;
34+
}

SYCL/ESIMD/dpas/dpas_common.hpp

Lines changed: 75 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ std::string toString(dpas_argument_type T) {
5050
return "bf16";
5151
case dpas_argument_type::tf32:
5252
return "tf32";
53-
case dpas_argument_type::S1:
54-
case dpas_argument_type::U1:
53+
case dpas_argument_type::s1:
54+
case dpas_argument_type::u1:
5555
case dpas_argument_type::Invalid:
5656
return "UNSUPPORTED";
5757
}
@@ -65,7 +65,7 @@ template <dpas_argument_type T> struct DpasPrintType {
6565
static constexpr bool is_uint = T == dpas_argument_type::u2 ||
6666
T == dpas_argument_type::u4 ||
6767
T == dpas_argument_type::u8;
68-
static constexpr bool is_fp = T == dpas_argument_type::FP16 ||
68+
static constexpr bool is_fp = T == dpas_argument_type::fp16 ||
6969
T == dpas_argument_type::bf16 ||
7070
T == dpas_argument_type::tf32;
7171

@@ -100,7 +100,7 @@ template <dpas_argument_type T> struct DpasNaturalOperandType {
100100
is_uint, unsigned char,
101101
std::conditional_t<
102102
is_fp16, sycl::half,
103-
std::conditional<
103+
std::conditional_t<
104104
is_bf16, sycl::ext::oneapi::experimental::bfloat16, void>>>>;
105105
};
106106

@@ -123,6 +123,11 @@ template <dpas_argument_type T> constexpr int getBitSize() {
123123

124124
case dpas_argument_type::tf32:
125125
return 32;
126+
127+
case dpas_argument_type::Invalid:
128+
case dpas_argument_type::s1:
129+
case dpas_argument_type::u1:
130+
break;
126131
}
127132
return 0;
128133
}
@@ -282,7 +287,8 @@ void printMatrix(void *Vec, std::string Msg) {
282287
}
283288

284289
template <int SystolicDepth, int RepeatCount, dpas_argument_type BPrec,
285-
dpas_argument_type APrec, bool UseSrc0>
290+
dpas_argument_type APrec, bool UseSrc0, int ExecSize,
291+
bool LetDeduceArgs>
286292
bool test(queue &Q, bool Print) {
287293
constexpr unsigned Size = 128;
288294
constexpr unsigned VL = 16;
@@ -300,12 +306,13 @@ bool test(queue &Q, bool Print) {
300306
// where:
301307
constexpr int M = RepeatCount;
302308
constexpr int K = SystolicDepth * OpsPerChannel;
303-
constexpr int N = 16; // Execution size: 16 for PVC.
309+
constexpr int N = ExecSize; // 16 for PVC, 8 for DG2.
304310

305311
auto Dev = Q.get_device();
306-
std::cout << "Running test case " << toString(BPrec, APrec)
307-
<< " with UseSrc0 = " << UseSrc0 << " on "
308-
<< Dev.get_info<info::device::name>() << "\n";
312+
std::cout << "Running on " << Dev.get_info<info::device::name>()
313+
<< " (ExecSize = " << ExecSize << "): " << toString(BPrec, APrec)
314+
<< ", UseSrc0 = " << UseSrc0
315+
<< ", LetDeduceArgs = " << LetDeduceArgs << std::endl;
309316

310317
using ANaturalType = typename DpasNaturalOperandType<APrec>::type;
311318
using BNaturalType = typename DpasNaturalOperandType<BPrec>::type;
@@ -317,6 +324,7 @@ bool test(queue &Q, bool Print) {
317324
auto BPacked = aligned_alloc_shared<BNaturalType>(128, BPackedSize, Q);
318325
auto Res = aligned_alloc_shared<ResNaturalType>(128, M * N, Q);
319326
// Init APacked;
327+
<<<<<<< HEAD
320328
<<<<<<< HEAD
321329
int Value = 1;
322330
for (int II = 0; II < M; II++) {
@@ -328,6 +336,12 @@ bool test(queue &Q, bool Print) {
328336
for (int JJ = 0; JJ < K; JJ++) {
329337
Value++;
330338
>>>>>>> 78be3ae16 ([ESIMD] Add tests for new esimd::xmx:dpas API (#1281))
339+
=======
340+
float Value = 1.2;
341+
for (int II = 0; II < M; II++) {
342+
for (int JJ = 0; JJ < K; JJ++) {
343+
Value += 1.1;
344+
>>>>>>> 7fc11d5ad ([ESIMD] Add more tests for new xmx::dpas() (#1291))
331345
writeToHorizontallyPackedMatrix<M, K, APrec>(
332346
APacked, II, JJ, static_cast<ANaturalType>(Value));
333347
}
@@ -357,15 +371,27 @@ bool test(queue &Q, bool Print) {
357371
simd<BNaturalType, BPackedSize> B(BPacked, overaligned_tag<16>{});
358372
simd<ResNaturalType, M * N> C;
359373

360-
if constexpr (UseSrc0) {
361-
// Compute C = C + AxB;
362-
C = 1;
363-
C = dpas<8, RepeatCount, ResNaturalType, ResNaturalType, BNaturalType,
364-
ANaturalType, BPrec, APrec>(C, B, A);
374+
if constexpr (LetDeduceArgs) {
375+
if constexpr (UseSrc0) {
376+
// Compute C = C + AxB;
377+
C = 1;
378+
C = dpas<8, RepeatCount, ResNaturalType>(C, B, A);
379+
} else {
380+
// Compute C = AxB;
381+
C = dpas<8, RepeatCount, ResNaturalType>(B, A);
382+
}
383+
365384
} else {
366-
// Compute C = AxB;
367-
C = dpas<8, RepeatCount, ResNaturalType, BNaturalType, ANaturalType,
368-
BPrec, APrec>(B, A);
385+
if constexpr (UseSrc0) {
386+
// Compute C = C + AxB;
387+
C = 1;
388+
C = dpas<8, RepeatCount, ResNaturalType, ResNaturalType, BNaturalType,
389+
ANaturalType, BPrec, APrec>(C, B, A);
390+
} else {
391+
// Compute C = AxB;
392+
C = dpas<8, RepeatCount, ResNaturalType, BNaturalType, ANaturalType,
393+
BPrec, APrec>(B, A);
394+
}
369395
}
370396

371397
C.copy_to(Res);
@@ -408,11 +434,40 @@ bool test(queue &Q, bool Print) {
408434
}
409435

410436
template <int SystolicDepth, int RepeatCount, dpas_argument_type T1,
411-
dpas_argument_type T2>
437+
dpas_argument_type T2, bool LetDeduceArgs = false>
412438
bool tests(queue Q, bool Print) {
413439
bool Passed = true;
414440
constexpr bool UseSrc0 = true;
415-
Passed &= test<SystolicDepth, RepeatCount, T1, T2, UseSrc0>(Q, Print);
416-
Passed &= test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0>(Q, Print);
441+
auto Dev = Q.get_device();
442+
443+
// Detect the execution size.
444+
// The device trait is not implemented for esimd_emulator. Use both 8 and 16.
445+
int ExecSize;
446+
bool IsEmulator = false;
447+
try {
448+
ExecSize = Dev.get_info<ext::intel::info::device::gpu_eu_simd_width>();
449+
} catch (sycl::exception e) {
450+
IsEmulator = true;
451+
}
452+
assert((IsEmulator || (ExecSize == 8 || ExecSize == 16)) &&
453+
"Execution size must be 8 or 16");
454+
455+
if (ExecSize == 16 || IsEmulator) {
456+
Passed &=
457+
test<SystolicDepth, RepeatCount, T1, T2, UseSrc0, 16, LetDeduceArgs>(
458+
Q, Print);
459+
Passed &=
460+
test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0, 16, LetDeduceArgs>(
461+
Q, Print);
462+
}
463+
if (ExecSize == 8 || IsEmulator) {
464+
Passed &=
465+
test<SystolicDepth, RepeatCount, T1, T2, UseSrc0, 8, LetDeduceArgs>(
466+
Q, Print);
467+
Passed &=
468+
test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0, 8, LetDeduceArgs>(
469+
Q, Print);
470+
}
471+
417472
return Passed;
418473
}

SYCL/ESIMD/dpas/dpas_fp16.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
77
//===----------------------------------------------------------------------===//
8-
// REQUIRES: gpu-intel-pvc || esimd_emulator
8+
// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 || esimd_emulator
99
// UNSUPPORTED: cuda || hip
10-
// RUN: %clangxx -fsycl %s -o %t.out
10+
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel %s -o %t.out
1111
// RUN: %GPU_RUN_PLACEHOLDER %t.out
1212

1313
// This test verifies DPAS support for float16.
@@ -20,14 +20,14 @@ int main(int argc, const char *argv[]) {
2020
bool Print = argc > 1 && std::string(argv[1]) == "-debug";
2121
bool Passed = true;
2222

23-
// Test unsigned 2-bit integers./
24-
Passed &= tests<8, 8, fp16, fp16>(Q, Print);
25-
Passed &= tests<8, 4, fp16, fp16>(Q, Print);
26-
Passed &= tests<8, 1, fp16, fp16>(Q, Print);
23+
constexpr bool LetDeduceArgs = true;
24+
Passed &= tests<8, 8, fp16, fp16, LetDeduceArgs>(Q, Print);
25+
Passed &= tests<8, 4, fp16, fp16, LetDeduceArgs>(Q, Print);
26+
Passed &= tests<8, 1, fp16, fp16, LetDeduceArgs>(Q, Print);
2727

2828
// TODO: Enable these cases when esimd::simd(ptr) constructor is fixed.
29-
// Passed &= tests<8, 5, fp16, fp16>(Q, Print);
30-
// Passed &= tests<8, 3, fp16, fp16>(Q, Print);
29+
// Passed &= tests<8, 5, fp16, fp16, LetDeduceArgs>(Q, Print);
30+
// Passed &= tests<8, 3, fp16, fp16, LetDeduceArgs>(Q, Print);
3131

3232
std::cout << (Passed ? "Test Passed\n" : "Test FAILED\n");
3333
return Passed ? 0 : 1;

SYCL/ESIMD/dpas/dpas_int.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
77
//===----------------------------------------------------------------------===//
8-
// REQUIRES: gpu-intel-pvc || esimd_emulator
8+
// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 || esimd_emulator
99
// UNSUPPORTED: cuda || hip
10-
// RUN: %clangxx -fsycl %s -o %t.out
10+
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel %s -o %t.out
1111
// RUN: %GPU_RUN_PLACEHOLDER %t.out
1212

1313
// This test verifies DPAS support for 2,4,8-bit integers.
@@ -20,7 +20,9 @@ int main(int argc, const char *argv[]) {
2020
bool Print = argc > 1 && std::string(argv[1]) == "-debug";
2121
bool Passed = true;
2222

23-
// Test unsigned 2-bit integers./
23+
constexpr bool LetDeduceArgs = true;
24+
25+
// Test unsigned 2-bit integers.
2426
Passed &= tests<8, 8, u2, u2>(Q, Print);
2527
Passed &= tests<8, 4, u2, u2>(Q, Print);
2628
// TODO: enable this case when the problem with simd constructor
@@ -54,7 +56,7 @@ int main(int argc, const char *argv[]) {
5456

5557
// Test couple combinations with 8-bit integers.
5658
Passed &= tests<8, 8, s8, s8>(Q, Print);
57-
Passed &= tests<8, 2, u8, s8>(Q, Print);
59+
Passed &= tests<8, 2, u8, s8, LetDeduceArgs>(Q, Print);
5860

5961
// Test some mixes of 2/4/8-bit integers.
6062
Passed &= tests<8, 8, s2, s4>(Q, Print);

0 commit comments

Comments
 (0)