Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 2834fab

Browse files
authored
[SYCL][CUDA] Unified matrix interface updated tests (#1183)
Updated all tests to use new "unified" interfaces from intel/llvm#7077. The old legacy interface implementation is deprecated but still tested via the _legacy files. Signed-off-by: JackAKirk <[email protected]>
1 parent f79467e commit 2834fab

6 files changed

+537
-8
lines changed

SYCL/Matrix/element_wise_all_ops_cuda.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
//===----------------------------------------------------------------------===//
88
// REQUIRES: cuda
99

10-
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
10+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
1111
// RUN: %t.out
1212

1313
#include <sycl/sycl.hpp>
@@ -64,9 +64,9 @@ void matrix_verify_op(queue q, big_matrix<T2, M * nWGperDim, N * nWGperDim> &C,
6464

6565
auto sg = spmd_item.get_sub_group();
6666

67-
joint_matrix<T, matrix_use::a, M, K> sub_a;
68-
joint_matrix<T, matrix_use::b, K, N> sub_b;
69-
joint_matrix<T2, matrix_use::accumulator, M, N> sub_c;
67+
joint_matrix<T, use::a, M, K, layout::row_major> sub_a;
68+
joint_matrix<T, use::b, K, N, layout::row_major> sub_b;
69+
joint_matrix<T2, use::accumulator, M, N> sub_c;
7070

7171
joint_matrix_fill(sg, sub_a, 3);
7272
joint_matrix_fill(sg, sub_b, 1);
@@ -104,7 +104,7 @@ void matrix_verify_op(queue q, big_matrix<T2, M * nWGperDim, N * nWGperDim> &C,
104104
accC.get_pointer() +
105105
(sg_startx * M) * (N * nWGperDim) +
106106
sg_starty / SG_SZ * N,
107-
(N * nWGperDim));
107+
(N * nWGperDim), layout::row_major);
108108
}); // parallel for
109109
}).wait();
110110
}
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
//==----------- element_wise_all_ops_cuda.cpp - DPC++ joint_matrix---------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: cuda
9+
10+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out
11+
// RUN: %t.out
12+
13+
#include <sycl/sycl.hpp>
14+
15+
using namespace sycl;
16+
using namespace sycl::ext::oneapi::experimental::matrix;
17+
using sycl::ext::oneapi::experimental::bfloat16;
18+
19+
#define SG_SZ 32
20+
constexpr size_t nWGperDim = 2;
21+
22+
class Logical {};
23+
24+
template <typename T1, typename T2, size_t M, size_t K, size_t N, typename OP>
25+
class KernelName;
26+
27+
template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
28+
public:
29+
T *mat;
30+
31+
public:
32+
T *get_data() { return mat; }
33+
void set_data(T *data) { mat = data; }
34+
big_matrix(T *data) : mat(data) {}
35+
};
36+
37+
template <typename T, size_t M, size_t N>
38+
void assert_ops_ref(T *C, const float ref) {
39+
for (size_t i = 0; i < M; i++)
40+
for (size_t j = 0; j < N; j++) {
41+
auto diff = C[i + j * M] - ref;
42+
assert(std::fabs(static_cast<float>(diff)) <
43+
std::numeric_limits<float>::epsilon());
44+
}
45+
}
46+
template <typename T, typename T2, size_t M, size_t K, size_t N,
47+
class Operation>
48+
void matrix_verify_op(queue q, big_matrix<T2, M * nWGperDim, N * nWGperDim> &C,
49+
nd_range<2> &r, const float ref, Operation Op) {
50+
{
51+
buffer<T2, 2> bufC(C.get_data(), range<2>(N * nWGperDim, M * nWGperDim));
52+
53+
q.submit([&](handler &cgh) {
54+
accessor<T2, 2, access::mode::read_write, target::device> accC(bufC,
55+
cgh);
56+
57+
cgh.parallel_for<KernelName<T, T2, M, K, N, Operation>>(
58+
r, [accC,
59+
Op](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
60+
const auto global_idx = spmd_item.get_global_id(0);
61+
const auto global_idy = spmd_item.get_global_id(1);
62+
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
63+
const auto sg_starty = global_idy - spmd_item.get_local_id(1);
64+
65+
auto sg = spmd_item.get_sub_group();
66+
67+
joint_matrix<T, matrix_use::a, M, K> sub_a;
68+
joint_matrix<T, matrix_use::b, K, N> sub_b;
69+
joint_matrix<T2, matrix_use::accumulator, M, N> sub_c;
70+
71+
joint_matrix_fill(sg, sub_a, 3);
72+
joint_matrix_fill(sg, sub_b, 1);
73+
joint_matrix_fill(sg, sub_c, -80);
74+
75+
auto wi_slice_a = sub_a.get_wi_data();
76+
for (int i = 0; i < wi_slice_a.length(); i++) {
77+
if constexpr (std::is_same_v<Operation, Logical>) {
78+
if (wi_slice_a[i]) {
79+
if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 3.0 ||
80+
wi_slice_a[i] < 4.0 || wi_slice_a[i] <= 3.0) {
81+
T val = (wi_slice_a[i] != (2.0)) ? wi_slice_a[i]
82+
: static_cast<T>(2.0);
83+
val = ((val) - (1));
84+
val = ((val) + (1));
85+
if (wi_slice_a[i] == (2.0)) {
86+
val = ((val) - (2));
87+
val = ((val) * (3));
88+
val = ((val) / (2));
89+
90+
} else {
91+
val = ((val) + (2));
92+
}
93+
wi_slice_a[i] = val;
94+
}
95+
}
96+
} else {
97+
wi_slice_a[i] = Op(wi_slice_a[i], 2);
98+
}
99+
}
100+
101+
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
102+
103+
joint_matrix_store(sg, sub_c,
104+
accC.get_pointer() +
105+
(sg_startx * M) * (N * nWGperDim) +
106+
sg_starty / SG_SZ * N,
107+
(N * nWGperDim));
108+
}); // parallel for
109+
}).wait();
110+
}
111+
assert_ops_ref<T2, M * nWGperDim, N * nWGperDim>(C.get_data(), ref);
112+
}
113+
114+
static constexpr size_t MATRIX_M = 16 * nWGperDim;
115+
static constexpr size_t MATRIX_N = 16 * nWGperDim;
116+
117+
int main() {
118+
119+
float D[MATRIX_M][MATRIX_N];
120+
big_matrix<float, MATRIX_M, MATRIX_N> MD_f((float *)&D);
121+
122+
queue q;
123+
auto computeCapability =
124+
std::stof(q.get_device().get_info<info::device::backend_version>());
125+
nd_range<2> r({nWGperDim, nWGperDim * SG_SZ}, {1, 1 * SG_SZ});
126+
127+
if (computeCapability >= 7.0) {
128+
matrix_verify_op<half, float, 16, 16, 16>(q, MD_f, r, 0.0,
129+
std::plus<half>{});
130+
matrix_verify_op<half, float, 16, 16, 16>(q, MD_f, r, 0.0, Logical{});
131+
matrix_verify_op<half, float, 16, 16, 16>(q, MD_f, r, 16.0,
132+
std::multiplies<half>{});
133+
matrix_verify_op<half, float, 16, 16, 16>(q, MD_f, r, -56.0,
134+
std::divides<half>{});
135+
matrix_verify_op<half, float, 16, 16, 16>(q, MD_f, r, -64.0,
136+
std::minus<half>{});
137+
}
138+
139+
if (computeCapability >= 7.2) {
140+
int32_t D_i[MATRIX_M][MATRIX_N];
141+
big_matrix<int32_t, MATRIX_M, MATRIX_N> MD_i((int32_t *)&D_i);
142+
matrix_verify_op<uint8_t, int32_t, 16, 16, 16>(q, MD_i, r, 0,
143+
std::plus<uint8_t>{});
144+
matrix_verify_op<uint8_t, int32_t, 16, 16, 16>(q, MD_i, r, 16,
145+
std::multiplies<uint8_t>{});
146+
matrix_verify_op<uint8_t, int32_t, 16, 16, 16>(q, MD_i, r, -64,
147+
std::minus<uint8_t>{});
148+
matrix_verify_op<int8_t, int32_t, 16, 16, 16>(q, MD_i, r, 0,
149+
std::plus<int8_t>{});
150+
matrix_verify_op<int8_t, int32_t, 16, 16, 16>(q, MD_i, r, 0.0, Logical{});
151+
matrix_verify_op<int8_t, int32_t, 16, 16, 16>(q, MD_i, r, 16,
152+
std::multiplies<int8_t>{});
153+
matrix_verify_op<int8_t, int32_t, 16, 16, 16>(q, MD_i, r, -64,
154+
std::minus<int8_t>{});
155+
}
156+
157+
if (computeCapability >= 8.0) {
158+
159+
matrix_verify_op<bfloat16, float, 16, 16, 16>(q, MD_f, r, 0.0,
160+
std::plus<bfloat16>{});
161+
matrix_verify_op<bfloat16, float, 16, 16, 16>(q, MD_f, r, 0.0, Logical{});
162+
matrix_verify_op<bfloat16, float, 16, 16, 16>(q, MD_f, r, 16.0,
163+
std::multiplies<bfloat16>{});
164+
matrix_verify_op<bfloat16, float, 16, 16, 16>(q, MD_f, r, -56.0,
165+
std::divides<bfloat16>{});
166+
matrix_verify_op<bfloat16, float, 16, 16, 16>(q, MD_f, r, -64.0,
167+
std::minus<bfloat16>{});
168+
169+
double D_d[MATRIX_M / 2][MATRIX_N / 2];
170+
big_matrix<double, 8 * nWGperDim, 8 * nWGperDim> MD_d((double *)&D_d);
171+
172+
matrix_verify_op<double, double, 8, 4, 8>(q, MD_d, r, -60.0,
173+
std::plus<double>{});
174+
matrix_verify_op<double, double, 8, 4, 8>(q, MD_d, r, -60.0, Logical{});
175+
matrix_verify_op<double, double, 8, 4, 8>(q, MD_d, r, -56.0,
176+
std::multiplies<double>{});
177+
matrix_verify_op<double, double, 8, 4, 8>(q, MD_d, r, -74.0,
178+
std::divides<double>{});
179+
matrix_verify_op<double, double, 8, 4, 8>(q, MD_d, r, -76.0,
180+
std::minus<double>{});
181+
}
182+
183+
return 0;
184+
}

SYCL/Matrix/element_wise_wi_marray.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
//===----------------------------------------------------------------------===//
88
// REQUIRES: cuda
99

10-
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
10+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
1111
// RUN: %t.out
1212

1313
#include <sycl/sycl.hpp>
@@ -30,8 +30,8 @@ template <typename T, size_t M, size_t K> void verify_wi_marray(queue q) {
3030
[ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
3131
auto sg = spmd_item.get_sub_group();
3232

33-
joint_matrix<T, matrix_use::a, M, K> sub_a;
34-
joint_matrix<T, matrix_use::a, M, K> sub_a_2;
33+
joint_matrix<T, use::a, M, K, layout::row_major> sub_a;
34+
joint_matrix<T, use::a, M, K, layout::row_major> sub_a_2;
3535

3636
joint_matrix_fill(sg, sub_a, -1);
3737
joint_matrix_fill(sg, sub_a_2, -1);
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
//==----------- element_wise_wi_marray.cpp - DPC++ joint_matrix------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: cuda
9+
10+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=3 %s -o %t.out
11+
// RUN: %t.out
12+
13+
#include <sycl/sycl.hpp>
14+
15+
using namespace sycl;
16+
using namespace sycl::ext::oneapi::experimental;
17+
using namespace sycl::ext::oneapi::experimental::matrix;
18+
19+
#define SG_SZ 32
20+
21+
template <typename T, size_t M, size_t K> void verify_wi_marray(queue q) {
22+
int err = 0;
23+
{
24+
buffer<int> err_buf(&err, 1);
25+
q.submit([&](handler &cgh) {
26+
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh);
27+
28+
cgh.parallel_for<class marray_kernel>(
29+
nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}),
30+
[ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
31+
auto sg = spmd_item.get_sub_group();
32+
33+
joint_matrix<T, matrix_use::a, M, K> sub_a;
34+
joint_matrix<T, matrix_use::a, M, K> sub_a_2;
35+
36+
joint_matrix_fill(sg, sub_a, -1);
37+
joint_matrix_fill(sg, sub_a_2, -1);
38+
39+
auto wi_slice_a = sub_a.get_wi_data();
40+
for (int i = 0; i < wi_slice_a.length(); i++) {
41+
wi_slice_a[i] = fabs(wi_slice_a[i]);
42+
}
43+
sub_a_2.wi_marray = fabs(sub_a_2.wi_marray);
44+
45+
for (int i = 0; i < sub_a_2.wi_marray.size(); i++) {
46+
if (sub_a_2.wi_marray[i] != wi_slice_a[i]) {
47+
ERR[0] = 1;
48+
}
49+
}
50+
}); // parallel for
51+
}).wait();
52+
}
53+
assert(err == 0);
54+
}
55+
56+
int main() {
57+
58+
queue q;
59+
auto computeCapability =
60+
std::stof(q.get_device().get_info<sycl::info::device::backend_version>());
61+
62+
if (computeCapability >= 8.0) {
63+
verify_wi_marray<bfloat16, 16, 16>(q);
64+
}
65+
66+
return 0;
67+
}

0 commit comments

Comments
 (0)