Skip to content

[Matrix][tests]Add tests for element wise ops on float type #9679

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//==----------- element_wise_all_ops_bf16.cpp - DPC++ joint_matrix---------==//
//==------------ element_wise_all_ops.cpp - DPC++ joint_matrix-------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
Expand All @@ -16,9 +16,10 @@

using namespace sycl;
using namespace sycl::ext::intel;
using namespace sycl::ext::oneapi;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::bfloat16;

#define SG_SZ 8

#include "../element_wise_all_ops_bf16_impl.hpp"
#include "../element_wise_all_ops_impl.hpp"
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//==----------- element_wise_all_ops_bf16.cpp - DPC++ joint_matrix---------==//
//==------------ element_wise_all_ops.cpp - DPC++ joint_matrix-------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
Expand All @@ -22,4 +22,4 @@ using bfloat16 = sycl::ext::oneapi::bfloat16;

#define SG_SZ 16

#include "element_wise_all_ops_bf16_impl.hpp"
#include "element_wise_all_ops_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -25,35 +25,42 @@ void assert_ops_ref(host_accessor<T, 2, access::mode::read> C,
const float ref) {
for (size_t i = 0; i < M; i++)
for (size_t j = 0; j < N; j++) {
auto diff = make_fp32(C[i][j]) - ref;
float diff;
if constexpr (std::is_same_v<T, bfloat16>)
diff = make_fp32(C[i][j]) - ref;
else
diff = C[i][j] - ref;
assert(std::fabs(static_cast<float>(diff)) <
std::numeric_limits<float>::epsilon());
}
}
template <typename T, size_t M, size_t N>
void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
const float ref) {
buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, N));
buffer<T, 2> bufA(A.get_data(), range<2>(M, N));

q.submit([&](handler &cgh) {
auto accA = bufA.get_access<access::mode::read_write>(cgh);

cgh.parallel_for<class add_matrix>(
r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
sycl::accessor accA{bufA, cgh, sycl::read_write};
cgh.parallel_for(
r, [accA](nd_item<2> spmd_item)[[intel::reqd_sub_group_size(SG_SZ)]] {
const auto global_idx = spmd_item.get_global_id(0);
const auto global_idy = spmd_item.get_global_id(1);
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;

joint_matrix_fill(sg, sub_a, bfloat16(5.0));

if constexpr (std::is_same_v<T, bfloat16>)
joint_matrix_fill(sg, sub_a, bfloat16(5.0));
else
joint_matrix_fill(sg, sub_a, 5);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] + bfloat16(2);
if constexpr (std::is_same_v<T, bfloat16>)
wi_slice_a[i] = wi_slice_a[i] + bfloat16(2);
else
wi_slice_a[i] = wi_slice_a[i] + 2;
}

ext::intel::experimental::matrix::joint_matrix_store(
Expand All @@ -62,154 +69,188 @@ void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
(sg_startx * TM) * N + sg_starty / SG_SZ * TN,
N);
}); // parallel for
}).wait();
})
.wait();
assert_ops_ref<T, M, N>(bufA.get_host_access(read_only), ref);
}

template <typename T, size_t M, size_t N>
void matrix_verify_sub(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
const float ref) {
buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, N));
buffer<T, 2> bufA(A.get_data(), range<2>(M, N));

q.submit([&](handler &cgh) {
auto accA = bufA.get_access<access::mode::read_write>(cgh);

cgh.parallel_for<class sub_matrix>(
r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
sycl::accessor accA{bufA, cgh, sycl::read_write};
cgh.parallel_for(
r, [accA](nd_item<2> spmd_item)[[intel::reqd_sub_group_size(SG_SZ)]] {
const auto global_idx = spmd_item.get_global_id(0);
const auto global_idy = spmd_item.get_global_id(1);
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;

joint_matrix_fill(sg, sub_a, bfloat16(5.0));

if constexpr (std::is_same_v<T, bfloat16>)
joint_matrix_fill(sg, sub_a, bfloat16(5.0));
else
joint_matrix_fill(sg, sub_a, 5);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] - bfloat16(2);
if constexpr (std::is_same_v<T, bfloat16>)
wi_slice_a[i] = wi_slice_a[i] - bfloat16(2);
else
wi_slice_a[i] = wi_slice_a[i] - 2;
}
ext::intel::experimental::matrix::joint_matrix_store(
sg, sub_a,
accA.template get_multi_ptr<access::decorated::no>() +
(sg_startx * TM) * N + sg_starty / SG_SZ * TN,
N);
}); // parallel for
}).wait();
})
.wait();
assert_ops_ref<T, M, N>(bufA.get_host_access(read_only), ref);
}

template <typename T, size_t M, size_t N>
void matrix_verify_mul(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
const float ref) {
buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, N));
buffer<T, 2> bufA(A.get_data(), range<2>(M, N));

q.submit([&](handler &cgh) {
auto accA = bufA.get_access<access::mode::read_write>(cgh);

cgh.parallel_for<class mul_matrix>(
r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
sycl::accessor accA{bufA, cgh, sycl::read_write};
cgh.parallel_for(
r, [accA](nd_item<2> spmd_item)[[intel::reqd_sub_group_size(SG_SZ)]] {
const auto global_idx = spmd_item.get_global_id(0);
const auto global_idy = spmd_item.get_global_id(1);
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;
joint_matrix_fill(sg, sub_a, bfloat16(5.0));

if constexpr (std::is_same_v<T, bfloat16>)
joint_matrix_fill(sg, sub_a, bfloat16(5.0));
else
joint_matrix_fill(sg, sub_a, 5);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] * bfloat16(3.0);
if constexpr (std::is_same_v<T, bfloat16>)
wi_slice_a[i] = wi_slice_a[i] * bfloat16(3.0);
else
wi_slice_a[i] = wi_slice_a[i] * 3.0;
}
ext::intel::experimental::matrix::joint_matrix_store(
sg, sub_a,
accA.template get_multi_ptr<access::decorated::no>() +
(sg_startx * TM) * N + sg_starty / SG_SZ * TN,
N);
}); // parallel for
}).wait();
})
.wait();
assert_ops_ref<T, M, N>(bufA.get_host_access(read_only), ref);
}

template <typename T, size_t M, size_t N>
void matrix_verify_div(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
const float ref) {
buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, N));
buffer<T, 2> bufA(A.get_data(), range<2>(M, N));

q.submit([&](handler &cgh) {
auto accA = bufA.get_access<access::mode::read_write>(cgh);

cgh.parallel_for<class div_matrix>(
r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
sycl::accessor accA{bufA, cgh, sycl::read_write};
cgh.parallel_for(
r, [accA](nd_item<2> spmd_item)[[intel::reqd_sub_group_size(SG_SZ)]] {
const auto global_idx = spmd_item.get_global_id(0);
const auto global_idy = spmd_item.get_global_id(1);
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;

joint_matrix_fill(sg, sub_a, bfloat16(4.0));

if constexpr (std::is_same_v<T, bfloat16>)
joint_matrix_fill(sg, sub_a, bfloat16(4.0));
else
joint_matrix_fill(sg, sub_a, 4);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] / bfloat16(2.0);
if constexpr (std::is_same_v<T, bfloat16>)
wi_slice_a[i] = wi_slice_a[i] / bfloat16(2.0);
else
wi_slice_a[i] = wi_slice_a[i] / 2.0;
}
ext::intel::experimental::matrix::joint_matrix_store(
sg, sub_a,
accA.template get_multi_ptr<access::decorated::no>() +
(sg_startx * TM) * N + sg_starty / SG_SZ * TN,
N);
}); // parallel for
}).wait();
})
.wait();
assert_ops_ref<T, M, N>(bufA.get_host_access(read_only), ref);
}

template <typename T, size_t M, size_t N>
void matrix_verify_logic(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
const float ref) {
buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, N));
buffer<T, 2> bufA(A.get_data(), range<2>(M, N));

q.submit([&](handler &cgh) {
auto accA = bufA.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class logic_matrix>(
r, [accA](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
sycl::accessor accA{bufA, cgh, sycl::read_write};
cgh.parallel_for(
r, [accA](nd_item<2> spmd_item)[[intel::reqd_sub_group_size(SG_SZ)]] {
const auto global_idx = spmd_item.get_global_id(0);
const auto global_idy = spmd_item.get_global_id(1);
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;

joint_matrix_fill(sg, sub_a, bfloat16(5.0));

if constexpr (std::is_same_v<T, bfloat16>)
joint_matrix_fill(sg, sub_a, bfloat16(5.0));
else
joint_matrix_fill(sg, sub_a, 5);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
if (wi_slice_a[i]) {
if (wi_slice_a[i] > bfloat16(2.0) ||
wi_slice_a[i] >= bfloat16(2.0) ||
wi_slice_a[i] < bfloat16(2.0) ||
wi_slice_a[i] <= bfloat16(2.0)) {
T val = (wi_slice_a[i] != bfloat16(2.0)) ? wi_slice_a[i]
: bfloat16(2.0);
val = bfloat16(make_fp32(val) - static_cast<float>(1));
val = bfloat16(make_fp32(val) + static_cast<float>(1));
if (wi_slice_a[i] == bfloat16(2.0)) {
val = bfloat16(make_fp32(val) - static_cast<float>(2));
val = bfloat16(make_fp32(val) * static_cast<float>(3));
val = bfloat16(make_fp32(val) / static_cast<float>(2));

} else {
val = bfloat16(make_fp32(val) + static_cast<float>(2));
if constexpr (std::is_same_v<T, bfloat16>) {
if (wi_slice_a[i] > bfloat16(2.0) ||
wi_slice_a[i] >= bfloat16(2.0) ||
wi_slice_a[i] < bfloat16(2.0) ||
wi_slice_a[i] <= bfloat16(2.0)) {
T val = (wi_slice_a[i] != bfloat16(2.0)) ? wi_slice_a[i]
: bfloat16(2.0);
val = bfloat16(make_fp32(val) - static_cast<float>(1));
val = bfloat16(make_fp32(val) + static_cast<float>(1));
if (wi_slice_a[i] == bfloat16(2.0)) {
val = bfloat16(make_fp32(val) - static_cast<float>(2));
val = bfloat16(make_fp32(val) * static_cast<float>(3));
val = bfloat16(make_fp32(val) / static_cast<float>(2));

} else {
val = bfloat16(make_fp32(val) + static_cast<float>(2));
}
wi_slice_a[i] = val;
}
} else {
if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 2.0 ||
wi_slice_a[i] < 2.0 || wi_slice_a[i] <= 2.0) {
T val = (wi_slice_a[i] != 2.0) ? wi_slice_a[i]
: static_cast<T>(2.0);
val = val - 1;
val = val + 1;
if (wi_slice_a[i] == 2.0) {
val = val - 2;
val = val * 3;
val = val / 2;

} else {
val = val + 2;
}
wi_slice_a[i] = val;
}
wi_slice_a[i] = val;
}
}
}
Expand All @@ -219,7 +260,8 @@ void matrix_verify_logic(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
(sg_startx * TM) * N + sg_starty / SG_SZ * TN,
N);
}); // parallel for
}).wait();
})
.wait();
assert_ops_ref<T, M, N>(bufA.get_host_access(read_only), ref);
}

Expand All @@ -236,21 +278,27 @@ void matrix_ops_ref(float *D, int M, int N) {
}
}

int main() {
template <typename T, typename Tref> int test_ewops() {

big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
big_matrix<bfloat16, MATRIX_M, MATRIX_N> MA((bfloat16 *)&A);
big_matrix<Tref, MATRIX_M, MATRIX_N> MD((Tref *)&D);
big_matrix<T, MATRIX_M, MATRIX_N> MA((T *)&A);

size_t NDRangeM = MATRIX_M / TM;
size_t NDRangeN = MATRIX_N / TN;
queue q;
nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ});

matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N>(q, MA, r, 7.0);
matrix_verify_sub<bfloat16, MATRIX_M, MATRIX_N>(q, MA, r, 3.0);
matrix_verify_mul<bfloat16, MATRIX_M, MATRIX_N>(q, MA, r, 15.0);
matrix_verify_div<bfloat16, MATRIX_M, MATRIX_N>(q, MA, r, 2.0);
matrix_verify_logic<bfloat16, MATRIX_M, MATRIX_N>(q, MA, r, 7.0);
matrix_verify_add<T, MATRIX_M, MATRIX_N>(q, MA, r, 7.0);
matrix_verify_sub<T, MATRIX_M, MATRIX_N>(q, MA, r, 3.0);
matrix_verify_mul<T, MATRIX_M, MATRIX_N>(q, MA, r, 15.0);
matrix_verify_div<T, MATRIX_M, MATRIX_N>(q, MA, r, 2.0);
matrix_verify_logic<T, MATRIX_M, MATRIX_N>(q, MA, r, 7.0);

return 0;
}

int main() {
test_ewops<bfloat16, float>();
test_ewops<float, float>();
return 0;
}