Skip to content

Commit 1c2587d

Browse files
authored
[SYCL][Joint Matrix][E2E] Add tests for big shapes for col major A and B loads/stores (#16999)
1 parent a064301 commit 1c2587d

File tree

6 files changed

+138
-26
lines changed

6 files changed

+138
-26
lines changed

sycl/test-e2e/Matrix/Inputs/common.hpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
77
//===----------------------------------------------------------------------===//
8+
#include <bitset>
89
#include <cmath>
10+
#include <iomanip>
911
#include <iostream>
1012
#include <random>
1113
#include <sycl/detail/core.hpp>
@@ -18,6 +20,18 @@ namespace syclex = sycl::ext::oneapi::experimental;
1820
namespace syclintelex = sycl::ext::intel::experimental;
1921
using bfloat16 = sycl::ext::oneapi::bfloat16;
2022

23+
void print_float_as_hex(float value) {
24+
union {
25+
float f;
26+
uint32_t i;
27+
} v;
28+
v.f = value;
29+
30+
std::ios_base::fmtflags f(std::cout.flags());
31+
std::cout << std::hex << std::setw(8) << std::setfill('0') << v.i;
32+
std::cout.flags(f);
33+
}
34+
2135
// Most of the time, failures related to floating-point calculations (both float
2236
// and bfloat16) are caused by accumulation errors rather than the algorithm
2337
// itself. If it is an algorithm issue, the calculated result gap from the
@@ -223,13 +237,19 @@ template <typename KernelName> size_t get_sg_size(queue q) {
223237
}
224238

225239
template <typename T>
226-
void matrix_print(unsigned int rows, unsigned int cols, T *mat) {
240+
void matrix_print(unsigned int rows, unsigned int cols, T *mat,
241+
bool hex = false) {
227242
for (unsigned int i = 0; i < rows; i++) {
228243
for (unsigned int j = 0; j < cols; j++) {
229244
if constexpr (std::is_integral_v<T>)
230245
std::cout << (int)mat[i * cols + j] << " ";
231-
else
232-
std::cout << (float)mat[i * cols + j] << " ";
246+
else {
247+
if (hex)
248+
print_float_as_hex((float)mat[i * cols + j]);
249+
else
250+
std::cout << (float)mat[i * cols + j];
251+
std::cout << " ";
252+
}
233253
}
234254
std::cout << "\n";
235255
}

sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
#include <iostream>
1010
#include <sycl/usm.hpp>
1111

12-
template <typename Tab, size_t K, layout B_layout> class mult;
12+
template <typename Tab, size_t TM, size_t TN, size_t TK, layout B_layout>
13+
class mult;
1314

1415
template <typename T1, typename T2, size_t M, size_t N, size_t K, size_t TM,
1516
size_t TN, size_t TK, layout A_layout, layout B_layout>
@@ -18,11 +19,11 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
1819
// Add one iteration for the out of bounds dpas instruction
1920
size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0);
2021
size_t NDRangeN = N / TN + (((N % TN) != 0) ? 1 : 0);
21-
size_t sg_size = get_sg_size<mult<T2, K, B_layout>>(q);
22+
size_t sg_size = get_sg_size<mult<T2, TM, TN, TK, B_layout>>(q);
2223
std::cout << "SG size: " << sg_size << " ";
2324

2425
q.submit([&](handler &cgh) {
25-
cgh.parallel_for<mult<T2, K, B_layout>>(
26+
cgh.parallel_for<mult<T2, TM, TN, TK, B_layout>>(
2627
nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
2728
[=](nd_item<2> spmd_item)
2829
#ifdef SG_SZ
@@ -147,13 +148,38 @@ void test() {
147148

148149
template <layout A_layout, layout B_layout> void test_all() {
149150
std::cout << "bf16: ";
150-
test<bfloat16, float, /*MATRIX_M*/ 1024 + 20, /*MATRIX_N*/ 1024 + 20,
151+
test<bfloat16, float, /*MATRIX_M*/ 1024 + 24, /*MATRIX_N*/ 1024 + 24,
151152
/*MATRIX_K*/ 1024 + 24, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, A_layout,
152153
B_layout>();
153154
std::cout << "half: ";
154-
test<half, float, 1024 + 20, 1024 + 20, 1024 + 24, 8, 16, 16, A_layout,
155+
test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 8, 16, 16, A_layout,
155156
B_layout>();
156157
std::cout << "int8: ";
157-
test<int8_t, int32_t, 1024, 1024 + 20, 1024 + 24, 8, 16, 32, A_layout,
158+
test<int8_t, int32_t, 1024, 1024, 1024 + 16, 8, 16, 32, A_layout, B_layout>();
159+
}
160+
161+
template <layout A_layout, layout B_layout> void test_all_big_shapes() {
162+
std::cout << "bf16: ";
163+
test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 16, 16, 16, A_layout,
164+
B_layout>();
165+
test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 1, 64, 16, A_layout,
166+
B_layout>();
167+
test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 1, 64, 32, A_layout,
168+
B_layout>();
169+
test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 32, 64, 16, A_layout,
170+
B_layout>();
171+
test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 32, 64, 32, A_layout,
172+
B_layout>();
173+
174+
std::cout << "half: ";
175+
test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 16, 16, 16, A_layout,
176+
B_layout>();
177+
test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 1, 64, 16, A_layout,
178+
B_layout>();
179+
test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 1, 64, 32, A_layout,
180+
B_layout>();
181+
test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 32, 64, 16, A_layout,
182+
B_layout>();
183+
test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 32, 64, 32, A_layout,
158184
B_layout>();
159185
}

sycl/test-e2e/Matrix/joint_matrix_16bit_colmajorA_colmajorB.cpp

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,10 @@
2929

3030
#include "common.hpp"
3131

32-
constexpr size_t TM = 8;
33-
constexpr size_t TN = 16;
34-
constexpr size_t TK = 16;
32+
template <typename T, size_t TM, size_t TN, size_t TK> class imatrix;
3533

36-
template <typename T> class imatrix;
37-
38-
template <typename T1, typename T2, size_t M, size_t N, size_t K>
34+
template <size_t TM, size_t TN, size_t TK, typename T1, typename T2, size_t M,
35+
size_t N, size_t K>
3936
void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
4037
big_matrix<T2, K, N> &B) {
4138
size_t NDRangeM = M / TM;
@@ -45,15 +42,15 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
4542
buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
4643

4744
queue q;
48-
size_t sg_size = get_sg_size<class imatrix<T2>>(q);
45+
size_t sg_size = get_sg_size<class imatrix<T2, TM, TN, TK>>(q);
4946
std::cout << "subgroup size " << sg_size << " ";
5047

5148
q.submit([&](handler &cgh) {
5249
auto accC = bufC.get_access<access::mode::read_write>(cgh);
5350
auto accA = bufA.template get_access<access::mode::read_write>(cgh);
5451
auto accB = bufB.template get_access<access::mode::read_write>(cgh);
5552

56-
cgh.parallel_for<class imatrix<T2>>(
53+
cgh.parallel_for<class imatrix<T2, TM, TN, TK>>(
5754
nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
5855
[=](nd_item<2> spmd_item)
5956
#ifdef SG_SZ
@@ -100,10 +97,13 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
10097
}).wait();
10198
}
10299

103-
template <typename T> void test() {
100+
template <typename T, size_t TM, size_t TN, size_t TK> void test() {
101+
std::cout << TM << "x" << TN << "x" << TK << " ";
102+
104103
static constexpr size_t MATRIX_M = TM * 2;
105104
static constexpr size_t MATRIX_N = TN * 2;
106105
static constexpr size_t MATRIX_K = TK * 2;
106+
107107
T A[MATRIX_K][MATRIX_M];
108108
T B[MATRIX_N][MATRIX_K];
109109
float C[MATRIX_M][MATRIX_N];
@@ -120,7 +120,7 @@ template <typename T> void test() {
120120
big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
121121
big_matrix<T, MATRIX_M, MATRIX_K> MA((T *)&A);
122122
big_matrix<T, MATRIX_K, MATRIX_N> MB((T *)&B);
123-
matrix_multiply(MC, MA, MB);
123+
matrix_multiply<TM, TN, TK>(MC, MA, MB);
124124
matrix_multiply_ref((T *)A, (T *)B, (float *)D, MATRIX_M, MATRIX_N, MATRIX_K,
125125
false, true, true);
126126

@@ -138,13 +138,27 @@ int main() {
138138
for (auto &combination : combinations) {
139139
if (!bf16_run && combination.atype == matrix_type::bf16) {
140140
std::cout << "bf16 ";
141-
test<bfloat16>();
141+
test<bfloat16, 8, 16, 16>();
142+
#ifdef BIG_SHAPES
143+
test<bfloat16, 16, 16, 16>();
144+
test<bfloat16, 1, 64, 16>();
145+
test<bfloat16, 1, 64, 32>();
146+
test<bfloat16, 32, 64, 16>();
147+
test<bfloat16, 32, 64, 32>();
148+
#endif
142149
bf16_run = true;
143150
}
144151

145152
if (!half_run && combination.atype == matrix_type::fp16) {
146153
std::cout << "half ";
147-
test<half>();
154+
test<half, 8, 16, 16>();
155+
#ifdef BIG_SHAPES
156+
test<half, 16, 16, 16>();
157+
test<half, 1, 64, 16>();
158+
test<half, 1, 64, 32>();
159+
test<half, 32, 64, 16>();
160+
test<half, 32, 64, 32>();
161+
#endif
148162
half_run = true;
149163
}
150164

sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
// other triples
1111

1212
// REQUIRES: aspect-ext_intel_matrix
13-
// XFAIL: arch-intel_gpu_bmg_g21
14-
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16922
1513
// UNSUPPORTED: gpu-intel-dg2, cpu
1614
// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
1715

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
//==----joint_matrix_out_bounds_big_shapes.cpp - DPC++ joint_matrix---------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// UNSUPPORTED: target-nvidia, target-amd
9+
// UNSUPPORTED-INTENDED: aspect-ext_intel_matrix isn't currently supported for
10+
// other triples
11+
12+
// REQUIRES: aspect-ext_intel_matrix
13+
14+
// UNSUPPORTED: gpu-intel-dg2, cpu
15+
// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
16+
17+
// RUN: %{build} -o %t.out
18+
// RUN: %{run} %t.out
19+
20+
// RUN: %{build} -o %t32.out -DSG_SZ=32
21+
// RUN: %{run} %t32.out
22+
23+
// XFAIL:gpu
24+
// XFAIL-TRACKER: GSD-5768
25+
26+
#include "common.hpp"
27+
#include "joint_matrix_out_bounds_impl.hpp"
28+
29+
int main() {
30+
std::cout << "A row major, B row major:\n";
31+
test_all_big_shapes<layout::row_major, layout::row_major>();
32+
std::cout << "A row major, B packed:\n";
33+
test_all_big_shapes<layout::row_major, layout::ext_intel_packed>();
34+
std::cout << "A col major, B col major:\n";
35+
test_all_big_shapes<layout::col_major, layout::col_major>();
36+
}

sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,17 @@
2727
#include "common.hpp"
2828
#include <sycl/usm.hpp>
2929

30-
template <typename T, size_t TileRows, size_t TileCols> class MT;
30+
template <typename T, size_t TileRows, size_t TileCols, use Use> class MT;
3131

3232
template <size_t TR, size_t TC, typename T, size_t NR, size_t NC, use Use>
3333
void matrix_transpose(T *in, T *out, queue q) {
3434
static_assert((NR % TR) == 0);
3535
static_assert((NC % TC) == 0);
36-
size_t sg_size = get_sg_size<class MT<T, TR, TC>>(q);
36+
size_t sg_size = get_sg_size<class MT<T, TR, TC, Use>>(q);
3737
std::cout << "SG size " << sg_size << " ";
3838

3939
q.submit([&](handler &cgh) {
40-
cgh.parallel_for<class MT<T, TR, TC>>(
40+
cgh.parallel_for<class MT<T, TR, TC, Use>>(
4141
nd_range<2>({NR / TR, NC / TC * sg_size}, {1, 1 * sg_size}),
4242
[=](nd_item<2> spmd_item)
4343
#ifdef SG_SZ
@@ -112,13 +112,31 @@ int main() {
112112
std::cout << "bf16:\n";
113113
test<bfloat16, 8, 16, use::a>();
114114
test<bfloat16, 16, 16, use::b>();
115+
#ifdef MORE_SHAPES
116+
test<bfloat16, 1, 16, use::a>();
117+
test<bfloat16, 1, 32, use::a>();
118+
test<bfloat16, 16, 16, use::a>();
119+
test<bfloat16, 32, 16, use::a>();
120+
test<bfloat16, 32, 32, use::a>();
121+
test<bfloat16, 16, 64, use::b>();
122+
test<bfloat16, 32, 64, use::b>();
123+
#endif
115124
bf16_run = true;
116125
}
117126

118127
if (!half_run && combination.atype == matrix_type::fp16) {
119128
std::cout << "half:\n";
120129
test<half, 8, 16, use::a>();
121130
test<half, 16, 16, use::b>();
131+
#ifdef MORE_SHAPES
132+
test<half, 1, 16, use::a>();
133+
test<half, 1, 32, use::a>();
134+
test<half, 16, 16, use::a>();
135+
test<half, 32, 16, use::a>();
136+
test<half, 32, 32, use::a>();
137+
test<half, 16, 64, use::b>();
138+
test<half, 32, 64, use::b>();
139+
#endif
122140
half_run = true;
123141
}
124142

0 commit comments

Comments
 (0)