Skip to content

Commit 4f68009

Browse files
authored
[SYCL][Matrix Test] Add out of bound and unalignment cases (#9845)
1 parent fb06746 commit 4f68009

File tree

6 files changed

+260
-0
lines changed

6 files changed

+260
-0
lines changed
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//==-------- joint_matrix_out_bounds.cpp - DPC++ joint_matrix--------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: matrix-xmx8
9+
10+
// RUN: %{build} -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
11+
// RUN: %{run} %t.out
12+
13+
// XFAIL:*
14+
15+
#include "../common.hpp"
16+
17+
constexpr size_t SG_SZ = 8;
18+
constexpr size_t TN = 8;
19+
static constexpr size_t MATRIX_K = 1024 + 24;
20+
21+
#include "../joint_matrix_out_bounds_impl.hpp"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix--------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: matrix-xmx8
9+
10+
// RUN: %{build} -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
11+
// RUN: %{run} %t.out
12+
13+
// XFAIL:*
14+
15+
#include "../common.hpp"
16+
17+
constexpr size_t SG_SZ = 8;
18+
constexpr size_t TN = 8;
19+
constexpr size_t MATRIX_K = 1024 + 14;
20+
21+
#include "../joint_matrix_out_bounds_impl.hpp"

sycl/test-e2e/Matrix/common.hpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#include <random>
2+
#include <sycl/sycl.hpp>
3+
4+
using bfloat16 = sycl::ext::oneapi::bfloat16;
5+
6+
constexpr float BF16_EPSILON = 0.00781250;
7+
8+
template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
9+
public:
10+
T *mat;
11+
12+
public:
13+
T *get_data() { return mat; }
14+
void set_data(T *data) { mat = data; }
15+
big_matrix(T *data) : mat(data) {}
16+
};
17+
18+
float make_fp32(bfloat16 x) {
19+
unsigned int y = *((int *)&x);
20+
y = y << 16;
21+
float *res = reinterpret_cast<float *>(&y);
22+
return *res;
23+
}
24+
25+
void matrix_multiply_ref(bfloat16 *A, bfloat16 *B, float *C, int MATRIX_M,
26+
int MATRIX_N, int MATRIX_K, bool transpose_c = false) {
27+
for (unsigned int i = 0; i < MATRIX_M; i++) {
28+
for (unsigned int k = 0; k < MATRIX_K; k++) {
29+
for (unsigned int j = 0; j < MATRIX_N; j++) {
30+
int c_ind = transpose_c ? (j * MATRIX_M + i) : i * MATRIX_N + j;
31+
C[c_ind] +=
32+
make_fp32(A[i * MATRIX_K + k]) * make_fp32(B[k * MATRIX_N + j]);
33+
}
34+
}
35+
}
36+
}
37+
38+
template <typename T>
39+
void matrix_vnni(unsigned int rows, unsigned int cols, T *src, T *dest,
40+
unsigned int vnniFactor = 2) {
41+
for (unsigned int i = 0; i < rows / vnniFactor; i++) {
42+
for (unsigned int j = 0; j < cols; j++) {
43+
for (unsigned int k = 0; k < vnniFactor; k++) {
44+
dest[i * cols * vnniFactor + j * vnniFactor + k] =
45+
src[(i * vnniFactor + k) * cols + j];
46+
}
47+
}
48+
}
49+
}
50+
51+
template <typename T>
52+
void matrix_fill(unsigned int rows, unsigned int cols, T *src, T val) {
53+
for (unsigned int i = 0; i < rows; i++) {
54+
for (unsigned int j = 0; j < cols; j++) {
55+
src[i * cols + j] = val;
56+
}
57+
}
58+
}
59+
60+
template <typename T>
61+
void matrix_rand(unsigned int rows, unsigned int cols, T *src, T val) {
62+
std::random_device dev;
63+
std::uniform_real_distribution<float> fdistr(-val, val);
64+
65+
for (unsigned int i = 0; i < rows; i++) {
66+
for (unsigned int j = 0; j < cols; j++) {
67+
src[i * cols + j] = T(fdistr(dev));
68+
}
69+
}
70+
}
71+
72+
template <typename T1, typename T2>
73+
bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) {
74+
bool res = true;
75+
for (int i = 0; i < rows; i++) {
76+
for (int j = 0; j < cols; j++) {
77+
if ((fabs(src[i * cols + j] - (T1)ref[i * cols + j])) > BF16_EPSILON) {
78+
res = false;
79+
}
80+
}
81+
}
82+
return res;
83+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//==-------- joint_matrix_out_bounds.cpp - DPC++ joint_matrix--------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: matrix
9+
10+
// RUN: %{build} -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
11+
// RUN: %{run} %t.out
12+
13+
// XFAIL:*
14+
15+
#include "common.hpp"
16+
17+
constexpr size_t SG_SZ = 16;
18+
constexpr size_t TN = 16;
19+
constexpr size_t MATRIX_K = 1024 + 24;
20+
21+
#include "joint_matrix_out_bounds_impl.hpp"
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#include <iostream>
2+
3+
using namespace sycl;
4+
using namespace sycl::ext::oneapi::experimental::matrix;
5+
6+
constexpr size_t TM = 8;
7+
constexpr size_t TK = 16;
8+
9+
template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A,
10+
size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C,
11+
size_t NUM_COLS_C>
12+
void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q, unsigned int vnniFactor) {
13+
size_t M = NUM_ROWS_C;
14+
size_t N = NUM_COLS_C;
15+
size_t K = NUM_COLS_A;
16+
17+
assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * vnniFactor);
18+
// Add one iteration for the out of bounds dpas instruction
19+
size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0);
20+
size_t NDRangeN = N / TN;
21+
22+
auto pA = multi_ptr<T2, sycl::access::address_space::global_space>(A);
23+
auto pB = multi_ptr<T2, sycl::access::address_space::global_space>(B);
24+
auto pC = multi_ptr<T1, sycl::access::address_space::global_space>(C);
25+
26+
q.submit([&](handler &cgh) {
27+
cgh.parallel_for(
28+
nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
29+
[=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
30+
31+
{
32+
// The submatrix API has to be accessed by all the workitems in a
33+
// subgroup these functions will be called once by the subgroup no
34+
// code divergence between the workitems
35+
const auto global_idx = spmd_item.get_global_id(0);
36+
const auto global_idy = spmd_item.get_global_id(1);
37+
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
38+
const auto sg_starty = global_idy - spmd_item.get_local_id(1);
39+
40+
sub_group sg = spmd_item.get_sub_group();
41+
joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major>
42+
sub_a;
43+
44+
// For B, since current implementation does not support non-packed
45+
// layout, users need to specify the packed_b layout.
46+
joint_matrix<sub_group, bfloat16, use::b, TK, TN,
47+
ext::intel::experimental::matrix::layout::packed>
48+
sub_b;
49+
joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
50+
joint_matrix_fill(sg, sub_c, 1);
51+
for (int k = 0; k < K; k += TK) {
52+
joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K);
53+
// Assume we alreay in vnni format.
54+
joint_matrix_load(sg, sub_b,
55+
pB + k * N + sg_starty / SG_SZ * TN * vnniFactor,
56+
N * vnniFactor);
57+
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
58+
}
59+
joint_matrix_store(
60+
sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N,
61+
layout::row_major);
62+
}); // parallel for
63+
}).wait();
64+
}
65+
66+
int main() {
67+
static constexpr size_t MATRIX_M = 1024 + 14;
68+
static constexpr size_t MATRIX_N = 1024;
69+
static constexpr unsigned int vnniFactor = 2;
70+
71+
queue q;
72+
bfloat16 *A = malloc_shared<bfloat16>(MATRIX_M * MATRIX_K, q);
73+
bfloat16 *B = malloc_shared<bfloat16>(MATRIX_K * MATRIX_N, q);
74+
bfloat16 *vnniB = malloc_shared<bfloat16>(MATRIX_K * MATRIX_N, q);
75+
float *C = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
76+
float *D = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
77+
78+
matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5);
79+
matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5);
80+
matrix_fill(MATRIX_M, MATRIX_N, C, (float)1);
81+
matrix_fill(MATRIX_M, MATRIX_N, D, (float)1);
82+
83+
matrix_vnni<bfloat16>(MATRIX_K, MATRIX_N, B, vnniB, vnniFactor);
84+
matrix_multiply<float, bfloat16, MATRIX_M, MATRIX_K, MATRIX_K / vnniFactor,
85+
MATRIX_N * vnniFactor, MATRIX_M, MATRIX_N>(C, A, vnniB, q,
86+
vnniFactor);
87+
matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K);
88+
89+
bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D);
90+
91+
std::cout << (res ? "passed" : "failed") << std::endl;
92+
return !res;
93+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix-------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: matrix
9+
10+
// RUN: %{build} -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
11+
// RUN: %{run} %t.out
12+
13+
// XFAIL:*
14+
15+
#include "common.hpp"
16+
17+
constexpr size_t SG_SZ = 16;
18+
constexpr size_t TN = 16;
19+
static constexpr size_t MATRIX_K = 1024 + 14;
20+
21+
#include "joint_matrix_out_bounds_impl.hpp"

0 commit comments

Comments
 (0)