
Commit 6e868ff
[SYCL][Matrix] test the two features: fill a matrix and element wise operations (#645)
Two new matrix features are being added to the DPC++ compiler: filling a matrix and element-wise operations. This PR adds tests for both features. Signed-off-by: Dounia Khaldi [email protected]
1 parent e9625da commit 6e868ff
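
For orientation, here is a minimal sketch (not part of the commit) of how the two features are exercised together, using the same experimental API calls that appear in the test added below: joint_matrix_fill initializes a sub-group tile to a constant, get_wi_data exposes each work-item's slice of the tile for element-wise updates, and joint_matrix_store writes the tile back to memory. The kernel name fill_then_add, the single-tile nd_range, and the vector-backed buffer are illustrative assumptions, not taken from the commit; the tile shape and sub-group size mirror the SG_SZ/TM/TK values defined in the test.

#include <CL/sycl.hpp>
#include <vector>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

constexpr size_t SG_SZ = 8;       // sub-group size used by the test below
constexpr size_t TM = 8, TK = 16; // tile shape used by the test below

int main() {
  queue q;
  std::vector<half> out(TM * TK, 0);
  {
    buffer<half, 2> buf(out.data(), range<2>(TM, TK));
    q.submit([&](handler &cgh) {
       auto acc = buf.get_access<access::mode::read_write>(cgh);
       // One work-group containing one sub-group that owns a single TM x TK tile.
       cgh.parallel_for<class fill_then_add>(
           nd_range<2>({1, SG_SZ}, {1, SG_SZ}), [=](nd_item<2> it) {
             ext::oneapi::sub_group sg = it.get_sub_group();
             joint_matrix<half, TM, TK> tile(sg);
             joint_matrix_fill(sg, tile, 5.0); // feature 1: fill the matrix
             auto slice = tile.get_wi_data();  // feature 2: element-wise ops
             for (int i = 0; i < slice.length(); i++)
               slice[i] = slice[i] + 2;
             joint_matrix_store(sg, tile, acc.get_pointer(), TK,
                                matrix_layout::row_major);
           });
     }).wait();
  }
  // After copy-back, every element of out should hold 7 (5 filled + 2 added).
  return 0;
}

The test below applies the same fill/slice/store pattern across a 2x2 grid of tiles and checks the stored values against a scalar reference on the host.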

File tree

3 files changed: +441, -6 lines changed
Lines changed in element_wise_all_ops_half.cpp: 260 additions & 0 deletions
@@ -0,0 +1,260 @@
//==----------- element_wise_all_ops_half.cpp - DPC++ joint_matrix---------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix

// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// There is a known bug in joint_matrix_fill when type is half
// A PR is being developed to fix the bug
// Will remove the XFAIL once this is fixed
// XFAIL: *

#include <CL/sycl.hpp>
#include <cassert>
#include <cmath>
#include <iostream>
#include <limits>
#include <random>

using namespace sycl;
using namespace sycl::ext::intel;
using namespace sycl::ext::oneapi::experimental::matrix;

#define SG_SZ 8

#define TM 8
#define TN SG_SZ
#define TK 16

template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
public:
  T *mat;

public:
  T *get_data() { return mat; }
  void set_data(T *data) { mat = data; }
  big_matrix(T *data) : mat(data) {}
};

template <typename T, size_t M, size_t N>
void assert_ops_ref(/*const T &C*/ accessor<T, 2, access::mode::read,
                                            access::target::host_buffer>
                        C,
                    const float ref) {
  for (size_t i = 0; i < M; i++)
    for (size_t j = 0; j < N; j++) {
      auto diff = C[i][j] - ref;
      assert(std::fabs(static_cast<float>(diff)) <
             std::numeric_limits<float>::epsilon());
    }
}
template <typename T, size_t M, size_t N>
void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
                       const float ref) {
  buffer<half, 2> bufA(A.get_data(), range<2>(M, N));

  q.submit([&](handler &cgh) {
     auto accA = bufA.get_access<access::mode::read_write>(cgh);

     cgh.parallel_for<class add_matrix>(r, [accA](nd_item<2> spmd_item) {
       const auto global_idx = spmd_item.get_global_id(0);
       const auto global_idy = spmd_item.get_global_id(1);
       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
       const auto sg_starty = global_idy - spmd_item.get_local_id(1);

       ext::oneapi::sub_group sg = spmd_item.get_sub_group();
       joint_matrix<T, TM, TK> sub_a(sg);

       // Fill the tile with 5, then add 2 to each element of the
       // work-item's slice; every element should equal ref (7.0 in main).
       joint_matrix_fill(sg, sub_a, 5.0);

       auto wi_slice_a = sub_a.get_wi_data();
       for (int i = 0; i < wi_slice_a.length(); i++) {
         wi_slice_a[i] = wi_slice_a[i] + 2;
       }
       joint_matrix_store(sg, sub_a,
                          accA.get_pointer() + (sg_startx * TM) * N +
                              sg_starty / SG_SZ * TN,
                          N, matrix_layout::row_major);
     }); // parallel for
   }).wait();
  assert_ops_ref<T, M, N>(bufA.get_access<access::mode::read>(), ref);
}

template <typename T, size_t M, size_t N>
void matrix_verify_sub(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
                       const float ref) {
  buffer<half, 2> bufA(A.get_data(), range<2>(M, N));

  q.submit([&](handler &cgh) {
     auto accA = bufA.get_access<access::mode::read_write>(cgh);

     cgh.parallel_for<class sub_matrix>(r, [accA](nd_item<2> spmd_item) {
       const auto global_idx = spmd_item.get_global_id(0);
       const auto global_idy = spmd_item.get_global_id(1);
       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
       const auto sg_starty = global_idy - spmd_item.get_local_id(1);

       ext::oneapi::sub_group sg = spmd_item.get_sub_group();
       joint_matrix<T, TM, TK> sub_a(sg);

       // Fill the tile with 5, then subtract 2 from each element of the
       // work-item's slice; every element should equal ref (3.0 in main).
       joint_matrix_fill(sg, sub_a, 5.0);

       auto wi_slice_a = sub_a.get_wi_data();
       for (int i = 0; i < wi_slice_a.length(); i++) {
         wi_slice_a[i] = wi_slice_a[i] - 2;
       }
       joint_matrix_store(sg, sub_a,
                          accA.get_pointer() + (sg_startx * TM) * N +
                              sg_starty / SG_SZ * TN,
                          N, matrix_layout::row_major);
     }); // parallel for
   }).wait();
  assert_ops_ref<T, M, N>(bufA.get_access<access::mode::read>(), ref);
}

template <typename T, size_t M, size_t N>
void matrix_verify_mul(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
                       const float ref) {
  buffer<half, 2> bufA(A.get_data(), range<2>(M, N));

  q.submit([&](handler &cgh) {
     auto accA = bufA.get_access<access::mode::read_write>(cgh);

     cgh.parallel_for<class mul_matrix>(r, [accA](nd_item<2> spmd_item) {
       const auto global_idx = spmd_item.get_global_id(0);
       const auto global_idy = spmd_item.get_global_id(1);
       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
       const auto sg_starty = global_idy - spmd_item.get_local_id(1);

       ext::oneapi::sub_group sg = spmd_item.get_sub_group();
       joint_matrix<T, TM, TK> sub_a(sg);

       // Fill the tile with 5, then multiply each element of the work-item's
       // slice by 3; every element should equal ref (15.0 in main).
       joint_matrix_fill(sg, sub_a, 5.0);

       auto wi_slice_a = sub_a.get_wi_data();
       for (int i = 0; i < wi_slice_a.length(); i++) {
         wi_slice_a[i] = wi_slice_a[i] * 3.0;
       }
       joint_matrix_store(sg, sub_a,
                          accA.get_pointer() + (sg_startx * TM) * N +
                              sg_starty / SG_SZ * TN,
                          N, matrix_layout::row_major);
     }); // parallel for
   }).wait();
  assert_ops_ref<T, M, N>(bufA.get_access<access::mode::read>(), ref);
}

template <typename T, size_t M, size_t N>
void matrix_verify_div(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
                       const float ref) {
  buffer<half, 2> bufA(A.get_data(), range<2>(M, N));

  q.submit([&](handler &cgh) {
     auto accA = bufA.get_access<access::mode::read_write>(cgh);

     cgh.parallel_for<class div_matrix>(r, [accA](nd_item<2> spmd_item) {
       const auto global_idx = spmd_item.get_global_id(0);
       const auto global_idy = spmd_item.get_global_id(1);
       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
       const auto sg_starty = global_idy - spmd_item.get_local_id(1);

       ext::oneapi::sub_group sg = spmd_item.get_sub_group();
       joint_matrix<T, TM, TK> sub_a(sg);

       // Fill the tile with 4, then divide each element of the work-item's
       // slice by 2; every element should equal ref (2.0 in main).
       joint_matrix_fill(sg, sub_a, 4.0);

       auto wi_slice_a = sub_a.get_wi_data();
       for (int i = 0; i < wi_slice_a.length(); i++) {
         wi_slice_a[i] = wi_slice_a[i] / 2.0;
       }
       joint_matrix_store(sg, sub_a,
                          accA.get_pointer() + (sg_startx * TM) * N +
                              sg_starty / SG_SZ * TN,
                          N, matrix_layout::row_major);
     }); // parallel for
   }).wait();
  assert_ops_ref<T, M, N>(bufA.get_access<access::mode::read>(), ref);
}

template <typename T, size_t M, size_t N>
void matrix_verify_logic(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
                         const float ref) {
  buffer<half, 2> bufA(A.get_data(), range<2>(M, N));

  q.submit([&](handler &cgh) {
     auto accA = bufA.get_access<access::mode::read_write>(cgh);

     cgh.parallel_for<class logic_matrix>(r, [accA](nd_item<2> spmd_item) {
       const auto global_idx = spmd_item.get_global_id(0);
       const auto global_idy = spmd_item.get_global_id(1);
       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
       const auto sg_starty = global_idy - spmd_item.get_local_id(1);

       ext::oneapi::sub_group sg = spmd_item.get_sub_group();
       joint_matrix<T, TM, TK> sub_a(sg);

       // Fill the tile with 5, then exercise comparison, increment/decrement,
       // and compound-assignment operators on the work-item's slice; the net
       // effect leaves every element equal to ref (7.0 in main).
       joint_matrix_fill(sg, sub_a, 5.0);

       auto wi_slice_a = sub_a.get_wi_data();
       for (int i = 0; i < wi_slice_a.length(); i++) {
         if (wi_slice_a[i]) {
           if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 2.0 ||
               wi_slice_a[i] < 2.0 || wi_slice_a[i] <= 2.0) {
             T val = (wi_slice_a[i] != 2.0) ? wi_slice_a[i] : 2.0;
             val--;
             val++;
             if (wi_slice_a[i] == 2.0) {
               val -= 2;
               val *= 3.0;
               val /= 2.0;
             } else {
               val += 2;
             }
             wi_slice_a[i] = val;
           }
         }
       }
       joint_matrix_store(sg, sub_a,
                          accA.get_pointer() + (sg_startx * TM) * N +
                              sg_starty / SG_SZ * TN,
                          N, matrix_layout::row_major);
     }); // parallel for
   }).wait();
  assert_ops_ref<T, M, N>(bufA.get_access<access::mode::read>(), ref);
}

static constexpr size_t MATRIX_M = TM * 2;
static constexpr size_t MATRIX_N = TN * 2;
half A[MATRIX_M][MATRIX_N];
float D[MATRIX_M][MATRIX_N];

void matrix_ops_ref(float *D, int M, int N) {
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      *(D + m * N + n) = 0;
      *(D + m * N + n) *= 2;
    }
}

int main() {

  big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
  big_matrix<half, MATRIX_M, MATRIX_N> MA((half *)&A);

  size_t NDRangeM = MATRIX_M / TM;
  size_t NDRangeN = MATRIX_N / TN;
  queue q;
  nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ});

  matrix_verify_add<half, MATRIX_M, MATRIX_N>(q, MA, r, 7.0);
  matrix_verify_sub<half, MATRIX_M, MATRIX_N>(q, MA, r, 3.0);
  matrix_verify_mul<half, MATRIX_M, MATRIX_N>(q, MA, r, 15.0);
  matrix_verify_div<half, MATRIX_M, MATRIX_N>(q, MA, r, 2.0);
  matrix_verify_logic<half, MATRIX_M, MATRIX_N>(q, MA, r, 7.0);

  return 0;
}
