Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][Matrix] Adding test cases for the joint_matrix_apply() and fixing namespace for get_wi_data() #1636

Merged
merged 20 commits into from
Mar 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions SYCL/Matrix/XMX8/joint_matrix_apply_bf16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//==----------- joint_matrix_apply_bf16.cpp - DPC++ joint_matrix-----------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix-xmx8

// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

#include <iostream>
#include <random>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::bfloat16;

#define SG_SZ 8

#include "../joint_matrix_apply_bf16_impl.hpp"
15 changes: 10 additions & 5 deletions SYCL/Matrix/element_wise_all_ops_bf16_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, bfloat16(5.0));

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] + bfloat16(2);
}
Expand Down Expand Up @@ -86,7 +87,8 @@ void matrix_verify_sub(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, bfloat16(5.0));

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] - bfloat16(2);
}
Expand Down Expand Up @@ -119,7 +121,8 @@ void matrix_verify_mul(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;
joint_matrix_fill(sg, sub_a, bfloat16(5.0));

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] * bfloat16(3.0);
}
Expand Down Expand Up @@ -153,7 +156,8 @@ void matrix_verify_div(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, bfloat16(4.0));

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] / bfloat16(2.0);
}
Expand Down Expand Up @@ -186,7 +190,8 @@ void matrix_verify_logic(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, bfloat16(5.0));

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
if (wi_slice_a[i]) {
if (wi_slice_a[i] > bfloat16(2.0) ||
Expand Down
15 changes: 10 additions & 5 deletions SYCL/Matrix/element_wise_all_ops_half_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 5);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] + static_cast<half>(2);
}
Expand Down Expand Up @@ -77,7 +78,8 @@ void matrix_verify_sub(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 5);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] - static_cast<half>(2);
}
Expand Down Expand Up @@ -111,7 +113,8 @@ void matrix_verify_mul(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 5);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] * static_cast<half>(3.0);
}
Expand Down Expand Up @@ -145,7 +148,8 @@ void matrix_verify_div(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 4);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] / static_cast<half>(2.0);
}
Expand Down Expand Up @@ -179,7 +183,8 @@ void matrix_verify_logic(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 5);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
if (wi_slice_a[i]) {
if (wi_slice_a[i] > static_cast<half>(2.0) ||
Expand Down
15 changes: 10 additions & 5 deletions SYCL/Matrix/element_wise_all_ops_int8_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 5);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] + 2;
}
Expand Down Expand Up @@ -77,7 +78,8 @@ void matrix_verify_sub(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 5);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] - 2;
}
Expand Down Expand Up @@ -111,7 +113,8 @@ void matrix_verify_mul(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 5);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] * 3;
}
Expand Down Expand Up @@ -145,7 +148,8 @@ void matrix_verify_div(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 4);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
wi_slice_a[i] = wi_slice_a[i] / 2;
}
Expand Down Expand Up @@ -179,7 +183,8 @@ void matrix_verify_logic(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_a, 5);

auto wi_slice_a = get_wi_data(sg, sub_a);
auto wi_slice_a =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_a);
for (int i = 0; i < wi_slice_a.length(); i++) {
if (wi_slice_a[i]) {
if (wi_slice_a[i] > 2 || wi_slice_a[i] >= 2 ||
Expand Down
15 changes: 10 additions & 5 deletions SYCL/Matrix/element_wise_all_ops_int8_packed_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_b, 5);

auto wi_slice_b = get_wi_data(sg, sub_b);
auto wi_slice_b =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b);
for (int i = 0; i < wi_slice_b.length(); i++) {
wi_slice_b[i] = wi_slice_b[i] + 2;
}
Expand Down Expand Up @@ -81,7 +82,8 @@ void matrix_verify_sub(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_b, 5);

auto wi_slice_b = get_wi_data(sg, sub_b);
auto wi_slice_b =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b);
for (int i = 0; i < wi_slice_b.length(); i++) {
wi_slice_b[i] = wi_slice_b[i] - 2;
}
Expand Down Expand Up @@ -117,7 +119,8 @@ void matrix_verify_mul(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_b, 5);

auto wi_slice_b = get_wi_data(sg, sub_b);
auto wi_slice_b =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b);
for (int i = 0; i < wi_slice_b.length(); i++) {
wi_slice_b[i] = wi_slice_b[i] * 3;
}
Expand Down Expand Up @@ -153,7 +156,8 @@ void matrix_verify_div(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_b, 4);

auto wi_slice_b = get_wi_data(sg, sub_b);
auto wi_slice_b =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b);
for (int i = 0; i < wi_slice_b.length(); i++) {
wi_slice_b[i] = wi_slice_b[i] / 2;
}
Expand Down Expand Up @@ -189,7 +193,8 @@ void matrix_verify_logic(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,

joint_matrix_fill(sg, sub_b, 5);

auto wi_slice_b = get_wi_data(sg, sub_b);
auto wi_slice_b =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b);
for (int i = 0; i < wi_slice_b.length(); i++) {
if (wi_slice_b[i]) {
if (wi_slice_b[i] > 2 || wi_slice_b[i] >= 2 ||
Expand Down
3 changes: 2 additions & 1 deletion SYCL/Matrix/element_wise_irreg_sum_rows_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ void matrix_sum_rows(queue q, big_matrix<T, M, N> &B, nd_range<2> &r) {
// (tK/4)
int32_t sum_local_rows[M] = {0}; // 8 local rows, M total
// sub_b has 32x8 elements, 32 elements per WI, 4 per WI per row
auto data = get_wi_data(sg, sub_b);
auto data =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_b);

// each WI calculates local sum of rows
for (int row = 0; row < TK / 4; row++) { // there are 8 rows
Expand Down
3 changes: 2 additions & 1 deletion SYCL/Matrix/element_wise_ops_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
N * 4);
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
}
auto wi_slice_c = get_wi_data(sg, sub_c);
auto wi_slice_c =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_c);
for (int i = 0; i < wi_slice_c.length(); i++) {
wi_slice_c[i] *= 2;
}
Expand Down
3 changes: 2 additions & 1 deletion SYCL/Matrix/elemwise_irreg_size_ops_bf16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
N * 2);
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
}
auto wi_slice_c = get_wi_data(sg, sub_c);
auto wi_slice_c =
sycl::ext::intel::experimental::matrix::get_wi_data(sg, sub_c);
for (int i = 0; i < wi_slice_c.length(); i++) {
wi_slice_c[i] += 5.0;
}
Expand Down
24 changes: 24 additions & 0 deletions SYCL/Matrix/joint_matrix_apply_bf16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//==----------- joint_matrix_apply_bf16.cpp - DPC++ joint_matrix-----------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix

// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

#include <iostream>
#include <random>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::bfloat16;

#define SG_SZ 16

#include "joint_matrix_apply_bf16_impl.hpp"
95 changes: 95 additions & 0 deletions SYCL/Matrix/joint_matrix_apply_bf16_impl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@

#define TM 8
#define TN SG_SZ
#define TK 16

static float make_fp32(bfloat16 x) {
unsigned int y = sycl::bit_cast<uint16_t>(x);
y = y << 16;
float *res = reinterpret_cast<float *>(&y);
return *res;
}

template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
public:
T *mat;

public:
T *get_data() { return mat; }
void set_data(T *data) { mat = data; }
big_matrix(T *data) : mat(data) {}
};

template <typename T> struct apply_add {
void operator()(T &x) const { x = x + bfloat16(2); }
};

template <typename T, size_t M, size_t N, typename F>
void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
const float ref, F &&lambda) {
buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, N));

q.submit([&](handler &cgh) {
accessor accA{bufA, cgh};

cgh.parallel_for(r, [accA, lambda](
nd_item<2> spmd_item) [[intel::reqd_sub_group_size(
SG_SZ)]] {
const auto global_idx = spmd_item.get_global_id(0);
const auto global_idy = spmd_item.get_global_id(1);
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;

joint_matrix_fill(sg, sub_a, bfloat16(5.0));

joint_matrix_apply(sg, sub_a, lambda);

ext::intel::experimental::matrix::joint_matrix_store(
sg, sub_a,
accA.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
N);
}); // parallel for
}).wait();
// Check if the results are correct
{
host_accessor Acc{bufA};
assert(std::all_of(Acc.begin(), Acc.end(), [=](auto Elem) {
return (std::fabs(static_cast<float>(make_fp32(Elem) - ref)) <
std::numeric_limits<float>::epsilon());
}));
}
}

static constexpr size_t MATRIX_M = TM * 2;
static constexpr size_t MATRIX_N = TN * 2;
bfloat16 A[MATRIX_M][MATRIX_N];
float D[MATRIX_M][MATRIX_N];

void matrix_ops_ref(float *D, int M, int N) {
for (int m = 0; m < M; m++)
for (int n = 0; n < N; n++) {
*(D + m * N + n) = 0;
*(D + m * N + n) *= 2;
}
}

int main() {

big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
big_matrix<bfloat16, MATRIX_M, MATRIX_N> MA((bfloat16 *)&A);

size_t NDRangeM = MATRIX_M / TM;
size_t NDRangeN = MATRIX_N / TN;
queue q;
nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ});

matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N>(
q, MA, r, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); });
matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N>(q, MA, r, 7.0,
apply_add<bfloat16>());
std::cout << "Passed\n";
return 0;
}