[Matrix][SYCL] Add bfloat16 support for joint_matrix #6113

Merged: 9 commits, May 9, 2022
151 changes: 151 additions & 0 deletions sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp
@@ -11,6 +11,7 @@
#include <CL/__spirv/spirv_ops.hpp>
#include <CL/sycl/detail/defines_elementary.hpp>
#include <CL/sycl/feature_test.hpp>
#include <sycl/ext/oneapi/experimental/bfloat16.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
@@ -453,6 +454,156 @@ class wi_element<uint16_t, NumRows, NumCols, Layout, Group> {
#undef OP
};

template <size_t NumRows, size_t NumCols, matrix_layout Layout, typename Group>
class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
Layout, Group> {
joint_matrix<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
Layout, Group> &M;
std::size_t idx;

public:
wi_element(joint_matrix<sycl::ext::oneapi::experimental::bfloat16, NumRows,
NumCols, Layout, Group> &Mat,
std::size_t i)
: M(Mat), idx(i) {}
operator sycl::ext::oneapi::experimental::bfloat16() {
#ifdef __SYCL_DEVICE_ONLY__
return __spirv_VectorExtractDynamic(M.spvm, idx);
#else
throw runtime_error("joint matrix is not supported on host device.",
PI_INVALID_DEVICE);
#endif // __SYCL_DEVICE_ONLY__
}

explicit operator bool() {
#ifdef __SYCL_DEVICE_ONLY__
return std::fabs(static_cast<float>(__spirv_VectorExtractDynamic(
M.spvm, idx))) >= std::numeric_limits<float>::epsilon();
#else
throw runtime_error("joint matrix is not supported on host device.",
PI_INVALID_DEVICE);
#endif // __SYCL_DEVICE_ONLY__
}

wi_element &operator=(const sycl::ext::oneapi::experimental::bfloat16 &rhs) {
#ifdef __SYCL_DEVICE_ONLY__
M.spvm = __spirv_VectorInsertDynamic(M.spvm, rhs, idx);
return *this;
#else
(void)rhs;
throw runtime_error("joint matrix is not supported on host device.",
PI_INVALID_DEVICE);
#endif // __SYCL_DEVICE_ONLY__
}

wi_element &
operator=(const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,
NumCols, Layout, Group> &rhs) {
#ifdef __SYCL_DEVICE_ONLY__
M.spvm = __spirv_VectorInsertDynamic(
M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx);
return *this;
#else
(void)rhs;
throw runtime_error("joint matrix is not supported on host device.",
PI_INVALID_DEVICE);
#endif // __SYCL_DEVICE_ONLY__
}

#if __SYCL_DEVICE_ONLY__
#define OP(opassign, op) \
wi_element &operator opassign( \
const sycl::ext::oneapi::experimental::bfloat16 &rhs) { \
M.spvm = __spirv_VectorInsertDynamic( \
M.spvm, __spirv_VectorExtractDynamic(M.spvm, idx) op rhs, idx); \
return *this; \
}
#else // __SYCL_DEVICE_ONLY__
#define OP(opassign, op) \
wi_element &operator opassign( \
const sycl::ext::oneapi::experimental::bfloat16 &rhs) { \
(void)rhs; \
throw runtime_error("joint matrix is not supported on host device.", \
PI_INVALID_DEVICE); \
}
#endif // __SYCL_DEVICE_ONLY__
OP(+=, +)
OP(-=, -)
OP(*=, *)
OP(/=, /)
#undef OP

#if __SYCL_DEVICE_ONLY__
#define OP(type, op) \
friend type operator op( \
const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, \
NumCols, Layout, Group> &lhs, \
const sycl::ext::oneapi::experimental::bfloat16 &rhs) { \
return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs; \
} \
friend type operator op( \
const sycl::ext::oneapi::experimental::bfloat16 &lhs, \
const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, \
NumCols, Layout, Group> &rhs) { \
return lhs op __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx);         \
}
OP(sycl::ext::oneapi::experimental::bfloat16, +)
OP(sycl::ext::oneapi::experimental::bfloat16, -)
OP(sycl::ext::oneapi::experimental::bfloat16, *)
OP(sycl::ext::oneapi::experimental::bfloat16, /)
#undef OP
#define OP(type, op) \
friend type operator op( \
const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, \
NumCols, Layout, Group> &lhs, \
const sycl::ext::oneapi::experimental::bfloat16 &rhs) { \
return type{static_cast<float>(__spirv_VectorExtractDynamic( \
lhs.M.spvm, lhs.idx)) op static_cast<float>(rhs)}; \
} \
friend type operator op( \
const sycl::ext::oneapi::experimental::bfloat16 &lhs, \
const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, \
NumCols, Layout, Group> &rhs) { \
return type{static_cast<float>(lhs) op static_cast<float>(               \
__spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx))};                 \
}
OP(bool, ==)
OP(bool, !=)
OP(bool, <)
OP(bool, >)
OP(bool, <=)
OP(bool, >=)
#undef OP
#else // __SYCL_DEVICE_ONLY__
#define OP(type, op) \
friend type operator op( \
const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, \
NumCols, Layout, Group> &, \
const sycl::ext::oneapi::experimental::bfloat16 &) { \
throw runtime_error("joint matrix is not supported on host device.", \
PI_INVALID_DEVICE); \
} \
friend type operator op( \
const sycl::ext::oneapi::experimental::bfloat16 &, \
const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, \
NumCols, Layout, Group> &) { \
throw runtime_error("joint matrix is not supported on host device.", \
PI_INVALID_DEVICE); \
}
OP(sycl::ext::oneapi::experimental::bfloat16, +)
OP(sycl::ext::oneapi::experimental::bfloat16, -)
OP(sycl::ext::oneapi::experimental::bfloat16, *)
OP(sycl::ext::oneapi::experimental::bfloat16, /)
OP(bool, ==)
OP(bool, !=)
OP(bool, <)
OP(bool, >)
OP(bool, <=)
OP(bool, >=)
#undef OP
#endif // __SYCL_DEVICE_ONLY__
};

template <typename T, size_t NumRows, size_t NumCols, matrix_layout Layout,
typename Group>
class wi_data {
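The specialization above gives bfloat16 the same element-wise access path that uint16_t already has: operator[] on the wi_data accessor defined just below returns this wi_element, so the compound-assignment and comparison operators apply per element. A minimal kernel-fragment sketch (sg, accA, TM, TK, and K follow the test below; the fragment is illustrative and not part of this header):

joint_matrix<bfloat16, TM, TK> sub_a(sg);
joint_matrix_load(sg, sub_a, accA.get_pointer(), K, matrix_layout::row_major);
// operator[] returns the wi_element specialization above, so *= resolves
// to its bfloat16 OP(*=, *) overload.
auto wi_slice_a = sub_a.get_wi_data();
for (int i = 0; i < wi_slice_a.length(); i++)
  wi_slice_a[i] *= bfloat16::from_bits(0x4000); // 0x4000 is 2.0 in bf16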
191 changes: 191 additions & 0 deletions sycl/test/matrix/matrix-bfloat16-test.cpp
@@ -0,0 +1,191 @@
// RUN: %clangxx -fsycl -O2 %s -o %t.out
#include <CL/sycl.hpp>
#if (SYCL_EXT_ONEAPI_MATRIX == 2)
#include <iostream>

using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;

static constexpr auto TILE_SZ = 16;
static constexpr auto TM = TILE_SZ - 1;
static constexpr auto TN = TILE_SZ - 1;
static constexpr auto TK = 2 * TILE_SZ - 2;

static constexpr auto SG_SZ = 16;

template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
public:
T *mat;

T *get_data() { return mat; }
void set_data(T *data) { mat = data; }
big_matrix(T *data) : mat(data) {}
};

template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A,
size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C,
size_t NUM_COLS_C>
void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
big_matrix<T2, NUM_ROWS_A, NUM_COLS_A> &A,
big_matrix<T2, NUM_ROWS_B, NUM_COLS_B> &B) {
size_t M = NUM_ROWS_C;
size_t N = NUM_COLS_C;
size_t K = NUM_COLS_A;
// For bf16, B is VNNI-packed: B => K/2 x N*2, A => M x K, C => M x N.
// A stride is the matrix's column count, e.g., B's stride = N*2.
assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 2);
size_t NDRangeM = M / TM;
size_t NDRangeN = N / TN;
sycl::buffer<bfloat16, 2> bufA(A.get_data(), sycl::range<2>(M, K));
sycl::buffer<bfloat16, 2> bufB(B.get_data(), sycl::range<2>(K, N));
sycl::buffer<float, 2> bufC((float *)C.get_data(), sycl::range<2>(M, N));

sycl::queue q;
q.submit([&](sycl::handler &cgh) {
auto accC = bufC.get_access<sycl::access::mode::read_write>(cgh);
auto accA = bufA.get_access<sycl::access::mode::read_write>(cgh);
auto accB = bufB.get_access<sycl::access::mode::read_write>(cgh);

cgh.parallel_for<class imatrix>(
sycl::nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
[accA, accB, accC, M, N, K](sycl::nd_item<2> spmd_item)

{
// The submatrix API must be accessed by all the work-items in a
// subgroup. These functions are called once per subgroup, so there
// is no code divergence between the work-items.
const auto global_idx = spmd_item.get_global_id(0);
const auto global_idy = spmd_item.get_global_id(1);
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sycl::ext::oneapi::sub_group sg = spmd_item.get_sub_group();
joint_matrix<bfloat16, TM, TK> sub_a(sg);
// For B, since the current implementation does not support a
// non-packed layout, users need to specify the updated VNNI sizes
// along with the packed_b layout. By default, the layout is
// row_major and the size is (TK, TN).
joint_matrix<bfloat16, TK, TN, matrix_layout::packed_b> sub_b(sg);
joint_matrix<float, TM, TN> sub_c(sg);

// AMX: 8 tile registers of 1 KB each; max tile shape SMmax x SKmax = 16x64.
// strideX is X's column count, so strideC = N, strideA = K, strideB = N*2.
joint_matrix_load(sg, sub_c,
accC.get_pointer() + (sg_startx * TM) * N +
sg_starty / SG_SZ * TN,
N, matrix_layout::row_major);
for (int k = 0; k < K / TK; k += 1) {
joint_matrix_load(
sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK,
K, matrix_layout::row_major);
// Assuming B data is already in VNNI format.
joint_matrix_load(sg, sub_b,
accB.get_pointer() + (k * TK / 2) * (N * 2) +
sg_starty / SG_SZ * TN * 2,
N * 2, matrix_layout::packed_b);
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
}
joint_matrix_store(sg, sub_c,
accC.get_pointer() + (sg_startx * TM) * N +
sg_starty / SG_SZ * TN,
N, matrix_layout::row_major);
}); // parallel for
}).wait();
}
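
// The kernel assumes B is already stored in VNNI (packed) layout. For
// reference, a hypothetical helper (not part of this test) that would
// pack a row-major K x N bf16 matrix into the K/2 x N*2 layout loaded
// above: element (k, n) moves to row k/2, column n*2 + k%2, so each
// 32-bit word of a packed row holds two consecutive-k values of one
// column.
static void pack_b_vnni(const bfloat16 *src, bfloat16 *dst, size_t K,
                        size_t N) {
  for (size_t k = 0; k < K; k++)
    for (size_t n = 0; n < N; n++)
      dst[(k / 2) * (N * 2) + n * 2 + (k % 2)] = src[k * N + n];
}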

static constexpr size_t MATRIX_M = TM * 2;
static constexpr size_t MATRIX_N = TN * 2;
static constexpr size_t MATRIX_K = TK * 2;
bfloat16 A[MATRIX_M][MATRIX_K];
bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
unsigned short Aref[MATRIX_M][MATRIX_K];
unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2];
float C[MATRIX_M][MATRIX_N];
float D[MATRIX_M][MATRIX_N];

float make_fp32(short x) {
// Widen a bf16 bit pattern to fp32 by placing it in the upper 16 bits.
unsigned int y = x;
y = y << 16;
float *res = reinterpret_cast<float *>(&y);
return *res;
}

unsigned short make_bf16(float x) {
// Truncate an fp32 value to bf16 by keeping only its upper 16 bits.
int *res = reinterpret_cast<int *>(&x);
*res = *res >> 16;
return (unsigned short)*res;
}
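
// The two helpers above type-pun through reinterpret_cast, which formally
// violates strict aliasing. An equivalent sketch using std::memcpy (not
// used by this test) performs the same bit moves with defined behavior:
#include <cstring>

inline float make_fp32_memcpy(unsigned short x) {
  unsigned int y = (unsigned int)x << 16; // bf16 bits into the high half
  float f;
  std::memcpy(&f, &y, sizeof(f));
  return f;
}

inline unsigned short make_bf16_memcpy(float x) {
  unsigned int y;
  std::memcpy(&y, &x, sizeof(y));
  return (unsigned short)(y >> 16); // keep the upper 16 bits
}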

void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N,
int K) {
// Straightforward triple loop over the packed data: each int holds two
// bf16 values, and the inner i-loop multiplies and accumulates both.
for (int m = 0; m < M; m++)
for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) {
short *va = (short *)(A_mem + m * K + k);
short *vb = (short *)(B_mem + k * N + n);
float acc = *((float *)(C_mem + m * N + n));
// FIXME: Should we do reduce-add in another version?
for (int i = 0; i < 2; i++) {
acc += (make_fp32(va[i]) * make_fp32(vb[i]));
}
*((float *)(C_mem + m * N + n)) = acc;
}
}
}

int main() {
for (int i = 0; i < MATRIX_M; i++) {
for (int j = 0; j < MATRIX_K; j++) {
// We create bfloat16 from unsigned short since float-to-bfloat16
// conversion is not allowed.
A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j)));
Aref[i][j] = make_bf16(1.0f * (i + j));
}
}
for (int i = 0; i < MATRIX_K / 2; i++) {
for (int j = 0; j < MATRIX_N * 2; j++) {
B[i][j] = bfloat16::from_bits(make_bf16(2.0f * i + 3.0f * j));
Bref[i][j] = make_bf16(2.0f * i + 3.0f * j);
}
}
for (int i = 0; i < MATRIX_M; i++) {
for (int j = 0; j < MATRIX_N; j++) {
C[i][j] = 1.0;
D[i][j] = 1.0;
}
}

big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
big_matrix<bfloat16, MATRIX_K / 2, MATRIX_N * 2> MB((bfloat16 *)&B);
matrix_multiply(MC, MA, MB);
matrix_multiply_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M,
MATRIX_N, MATRIX_K / 2);

bool res = true;
for (int i = 0; i < MATRIX_M; i++) {
for (int j = 0; j < MATRIX_N; j++) {
if (C[i][j] != D[i][j])
res = false;
}
}
if (res)
std::cout << "passed\n";
else
std::cout << "failed\n";
for (int i = 0; i < MATRIX_M; i++) {
for (int j = 0; j < MATRIX_N; j++)
std::cout << C[i][j] << ", ";
std::cout << "\n";
}
std::cout << std::endl;
for (int i = 0; i < MATRIX_M; i++) {
for (int j = 0; j < MATRIX_N; j++)
std::cout << D[i][j] << ", ";
std::cout << "\n";
}
}
#endif // (SYCL_EXT_ONEAPI_MATRIX == 2)