This repository was archived by the owner on Mar 28, 2023. It is now read-only.
forked from llvm/llvm-test-suite
[SYCL] Add new tests for int8 and bfloat16 automatic transpose and VNNI transform #1415
Merged

steffenlarsen merged 5 commits into intel:intel from yubingex007-a11y:row-col-major-int8-bfloat16 on Dec 2, 2022
Changes from all commits

Commits (5):
- 5d3d1c7 dkhaldi: Add new tests for automatic transpose and VNNI transform
- 5759d9f yubingex007-a11y: fix a bug in bfloat16's testcase
- 8302263 yubingex007-a11y: add testcases for subB(int8, colmajor/rowmajor) and subA(int8,
- 98f24ea yubingex007-a11y: address dounia's comments
- 67a7f27 yubingex007-a11y: fix clang-format issue
SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -0,0 +1,29 @@
//==-- joint_matrix_bfloat16_colmajorA_colmajorB.cpp - DPC++ joint_matrix--==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix

// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// CHECK: passed

// This test covers support for the column-major layout of matrix B, which
// requires a transpose followed by a VNNI transform. This is currently
// available only on AMX.

// XFAIL: gpu

#include <iostream>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;

#define SG_SZ 16

#include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp (143 additions, 0 deletions)
@@ -0,0 +1,143 @@
#define TM 8
#define TN SG_SZ
#define TK 16
#define BF16_EPSILON 0.00781250

template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
private:
  T *mat;

public:
  T *get_data() { return mat; }
  void set_data(T *data) { mat = data; }
  big_matrix(T *data) : mat(data) {}
};

template <typename T1, typename T2, size_t M, size_t N, size_t K>
void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                     big_matrix<T2, K, N> &B) {
  size_t NDRangeM = M / TM;
  size_t NDRangeN = N / TN;
  buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
  buffer<bfloat16, 2> bufB(B.get_data(), range<2>(K, N));
  buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));

  queue q;
  q.submit([&](handler &cgh) {
     auto accC = bufC.get_access<access::mode::read_write>(cgh);
     auto accA = bufA.get_access<access::mode::read_write>(cgh);
     auto accB = bufB.get_access<access::mode::read_write>(cgh);

     cgh.parallel_for<class imatrix>(
         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]

         {
           // The submatrix API has to be accessed by all the work-items in a
           // sub-group. These functions are called once per sub-group, so
           // there is no code divergence between the work-items.
           const auto global_idx = spmd_item.get_global_id(0);
           const auto global_idy = spmd_item.get_global_id(1);
           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
           const auto sg_starty = global_idy - spmd_item.get_local_id(1);

           ext::oneapi::sub_group sg = spmd_item.get_sub_group();
           joint_matrix<bfloat16, TM, TK> sub_a(sg);
           joint_matrix<bfloat16, TK, TN, matrix_layout::packed_b> sub_b(sg);
           joint_matrix<float, TM, TN> sub_c(sg);

           joint_matrix_load(sg, sub_c,
                             accC.get_pointer() + (sg_startx * TM) * N +
                                 sg_starty / SG_SZ * TN,
                             N, matrix_layout::row_major);
           for (int k = 0; k < K / TK; k += 1) {
             // A is stored column-major, so the leading dimension is M and a
             // (row, col) tile starts at col * M + row.
             joint_matrix_load(
                 sg, sub_a, accA.get_pointer() + (k * TK) * M + sg_startx * TM,
                 M, matrix_layout::col_major);
             // B is also column-major: leading dimension K, offset col * K + row.
             joint_matrix_load(sg, sub_b,
                               accB.get_pointer() +
                                   (sg_starty / SG_SZ * TN) * K + k * TK,
                               K, matrix_layout::col_major);
             sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
           }
           joint_matrix_store(sg, sub_c,
                              accC.get_pointer() + (sg_startx * TM) * N +
                                  sg_starty / SG_SZ * TN,
                              N, matrix_layout::row_major);
         }); // parallel for
   }).wait();
}
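As a concrete sizing check (a standalone sketch, not part of the test, using the constants defined just below: MATRIX_M = 16, MATRIX_N = 32, MATRIX_K = 32), the dispatch amounts to a 2 x 2 grid of sub-groups, each owning one 8 x 16 tile of C:

```c++
#include <cstdio>

// Standalone arithmetic sketch of the work decomposition above; the
// constants mirror TM/TN/TK/SG_SZ and the MATRIX_* sizes in this test.
int main() {
  const unsigned M = 16, N = 32, K = 32;
  const unsigned TM = 8, TN = 16, TK = 16, SG_SZ = 16;
  std::printf("C tile grid: %u x %u\n", M / TM, N / TN);             // 2 x 2
  std::printf("global range: {%u, %u}\n", M / TM, (N / TN) * SG_SZ); // {2, 32}
  std::printf("mad steps per tile: %u\n", K / TK);                   // 2
}
```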
static constexpr size_t MATRIX_M = TM * 2;
static constexpr size_t MATRIX_N = TN * 2;
static constexpr size_t MATRIX_K = TK * 2;
// A and B are stored transposed (column-major): A is K x M, B is N x K.
bfloat16 A[MATRIX_K][MATRIX_M];
bfloat16 B[MATRIX_N][MATRIX_K];
unsigned short Aref[MATRIX_K][MATRIX_M];
unsigned short Bref[MATRIX_N][MATRIX_K];
float C[MATRIX_M][MATRIX_N];
float D[MATRIX_M][MATRIX_N];

// Expand bf16 bits to fp32: bfloat16 is the upper 16 bits of an IEEE-754
// float, so shifting left by 16 reconstructs the value exactly.
float make_fp32(short x) {
  unsigned int y = x;
  y = y << 16;
  float *res = reinterpret_cast<float *>(&y);
  return *res;
}

// Truncate fp32 to bf16 bits by dropping the low 16 bits (no rounding).
unsigned short make_bf16(float x) {
  int *res = reinterpret_cast<int *>(&x);
  *res = *res >> 16;
  return (unsigned short)*res;
}
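As a quick sanity check of these helpers (a sketch assuming the two functions above are in scope): 1.0f has bit pattern 0x3F800000, so its bf16 bits are the top half, 0x3F80, and expanding them back reproduces exactly 1.0f:

```c++
#include <cassert>

int main() {
  assert(make_bf16(1.0f) == 0x3F80); // top 16 bits of 0x3F800000
  assert(make_fp32(0x3F80) == 1.0f); // 0x3F80 << 16 == 0x3F800000 == 1.0f
}
```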
void matrix_multiply_ref(int M, int N, int K) {
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      for (int k = 0; k < K; k++) {
        // Aref and Bref are stored transposed, hence the [k][m] and [n][k]
        // indexing.
        D[m][n] += make_fp32(Aref[k][m]) * make_fp32(Bref[n][k]);
      }
    }
}

int main() {
  for (int i = 0; i < MATRIX_K; i++) {
    for (int j = 0; j < MATRIX_M; j++) {
      // bfloat16 is created from unsigned short bits since conversion from
      // float to bfloat16 is not supported on the host side yet.
      A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j)));
      Aref[i][j] = make_bf16(1.0f * (i + j));
    }
  }
  for (int i = 0; i < MATRIX_N; i++) {
    for (int j = 0; j < MATRIX_K; j++) {
      B[i][j] = bfloat16::from_bits(make_bf16(2.0f * i + 3.0f * j));
      Bref[i][j] = make_bf16(2.0f * i + 3.0f * j);
    }
  }
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      C[i][j] = 1.0;
      D[i][j] = 1.0;
    }
  }

  big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
  big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
  big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
  big_matrix<bfloat16, MATRIX_K, MATRIX_N> MB((bfloat16 *)&B);
  matrix_multiply(MC, MA, MB);
  matrix_multiply_ref(MATRIX_M, MATRIX_N, MATRIX_K);

  bool res = true;
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      // Compare element-wise against the reference result.
      if (fabs(C[i][j] - D[i][j]) > BF16_EPSILON)
        res = false;
    }
  }
  if (res)
    std::cout << "passed\n";
  else
    std::cout << "failed\n";
}
SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
@@ -0,0 +1,29 @@
//==--joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp - DPC++ joint_matrix---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix

// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// CHECK: passed

// This test covers support for the row-major layout of matrix B, which
// requires an automatic VNNI transform. This is currently available only
// on AMX.

// XFAIL: gpu

#include <iostream>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;

#define SG_SZ 16

#include "joint_matrix_bfloat16_rowmajorA_rowmajorB_impl.hpp"
SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB_impl.hpp (143 additions, 0 deletions)
@@ -0,0 +1,143 @@
#define TM 8
#define TN SG_SZ
#define TK 16
#define BF16_EPSILON 0.00781250

template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
private:
  T *mat;

public:
  T *get_data() { return mat; }
  void set_data(T *data) { mat = data; }
  big_matrix(T *data) : mat(data) {}
};

template <typename T1, typename T2, size_t M, size_t N, size_t K>
void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                     big_matrix<T2, K, N> &B) {
  size_t NDRangeM = M / TM;
  size_t NDRangeN = N / TN;
  buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
  buffer<bfloat16, 2> bufB(B.get_data(), range<2>(K, N));
  buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));

  queue q;
  q.submit([&](handler &cgh) {
     auto accC = bufC.get_access<access::mode::read_write>(cgh);
     auto accA = bufA.get_access<access::mode::read_write>(cgh);
     auto accB = bufB.get_access<access::mode::read_write>(cgh);

     cgh.parallel_for<class imatrix>(
         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]

         {
           // The submatrix API has to be accessed by all the work-items in a
           // sub-group. These functions are called once per sub-group, so
           // there is no code divergence between the work-items.
           const auto global_idx = spmd_item.get_global_id(0);
           const auto global_idy = spmd_item.get_global_id(1);
           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
           const auto sg_starty = global_idy - spmd_item.get_local_id(1);

           ext::oneapi::sub_group sg = spmd_item.get_sub_group();
           joint_matrix<bfloat16, TM, TK> sub_a(sg);
           joint_matrix<bfloat16, TK, TN, matrix_layout::packed_b> sub_b(sg);
           joint_matrix<float, TM, TN> sub_c(sg);

           joint_matrix_load(sg, sub_c,
                             accC.get_pointer() + (sg_startx * TM) * N +
                                 sg_starty / SG_SZ * TN,
                             N, matrix_layout::row_major);
           for (int k = 0; k < K / TK; k += 1) {
             // A and B are row-major here, so both loads use row * ld + col
             // with leading dimensions K and N respectively.
             joint_matrix_load(
                 sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK,
                 K, matrix_layout::row_major);
             joint_matrix_load(sg, sub_b,
                               accB.get_pointer() + (k * TK) * N +
                                   sg_starty / SG_SZ * TN,
                               N, matrix_layout::row_major);
             sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
           }
           joint_matrix_store(sg, sub_c,
                              accC.get_pointer() + (sg_startx * TM) * N +
                                  sg_starty / SG_SZ * TN,
                              N, matrix_layout::row_major);
         }); // parallel for
   }).wait();
}

static constexpr size_t MATRIX_M = TM * 2;
static constexpr size_t MATRIX_N = TN * 2;
static constexpr size_t MATRIX_K = TK * 2;
bfloat16 A[MATRIX_M][MATRIX_K];
bfloat16 B[MATRIX_K][MATRIX_N];
unsigned short Aref[MATRIX_M][MATRIX_K];
unsigned short Bref[MATRIX_K][MATRIX_N];
float C[MATRIX_M][MATRIX_N];
float D[MATRIX_M][MATRIX_N];

// Expand bf16 bits to fp32 (bf16 is the upper half of an fp32 bit pattern).
float make_fp32(short x) {
  unsigned int y = x;
  y = y << 16;
  float *res = reinterpret_cast<float *>(&y);
  return *res;
}

// Truncate fp32 to bf16 bits by dropping the low 16 bits (no rounding).
unsigned short make_bf16(float x) {
  int *res = reinterpret_cast<int *>(&x);
  *res = *res >> 16;
  return (unsigned short)*res;
}

void matrix_multiply_ref(int M, int N, int K) {
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      for (int k = 0; k < K; k++) {
        D[m][n] += make_fp32(Aref[m][k]) * make_fp32(Bref[k][n]);
      }
    }
}

int main() {
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_K; j++) {
      // bfloat16 is created from unsigned short bits since conversion from
      // float to bfloat16 is not supported on the host side yet.
      A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j)));
      Aref[i][j] = make_bf16(1.0f * (i + j));
    }
  }
  for (int i = 0; i < MATRIX_K; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      B[i][j] = bfloat16::from_bits(make_bf16(2.0f * i + 3.0f * j));
      Bref[i][j] = make_bf16(2.0f * i + 3.0f * j);
    }
  }
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      C[i][j] = 1.0;
      D[i][j] = 1.0;
    }
  }

  big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
  big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
  big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
  big_matrix<bfloat16, MATRIX_K, MATRIX_N> MB((bfloat16 *)&B);
  matrix_multiply(MC, MA, MB);
  matrix_multiply_ref(MATRIX_M, MATRIX_N, MATRIX_K);

  bool res = true;
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      // Compare element-wise against the reference result.
      if (fabs(C[i][j] - D[i][j]) > BF16_EPSILON)
        res = false;
    }
  }
  if (res)
    std::cout << "passed\n";
  else
    std::cout << "failed\n";
}
SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -0,0 +1,28 @@
//==----- joint_matrix_int8_colmajorA_colmajorB.cpp - DPC++ joint_matrix---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix

// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// CHECK: passed

// This test covers support for the column-major layout of matrix B, which
// requires a transpose followed by a VNNI transform. This is currently
// available only on AMX.

// XFAIL: gpu

#include <iostream>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

#define SG_SZ 16

#include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp"
Conversations

Comment:
This only has an effect if you run the output through FileCheck. It would be easier to just have the main functions return a non-0 value if they fail.
Reply:
Maybe later we can change all of this; other testcases have the same issue as well.
Reply:
If this is indeed a problem for the other tests, I am okay with merging it as-is, but it should be addressed ASAP. Note that anything that can cause "failed" to be printed will not currently be considered a failure by the test system for these tests.
Reply:
Patch for fixing this: #1432
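For reference, a minimal sketch of the reviewer's suggestion (hypothetical code; it is not taken from this PR or from #1432): report failure through the process exit code so the harness can flag it even when the output is not run through FileCheck:

```c++
#include <iostream>

// Hypothetical stand-in for the element-wise C-vs-D comparison in each test.
bool verify() { return true; }

int main() {
  bool res = verify();
  std::cout << (res ? "passed" : "failed") << "\n";
  return res ? 0 : 1; // non-zero exit marks the run as failed by itself
}
```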