Skip to content

Commit ec23228

Browse files
dkhaldibb-sycl
authored andcommitted
[SYCL][matrix] add a new test for irregular slicing on packed matrix that calculates sum of rows of the matrix (intel#899)
1 parent 20422bb commit ec23228

File tree

1 file changed

+131
-0
lines changed

1 file changed

+131
-0
lines changed
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
//==-------- element_wise_irreg_sum_rows.cpp - DPC++ joint_matrix----- ----==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: matrix
9+
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
12+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
13+
14+
// this code calculates the sum of rows into a global array of number of rows
15+
// elements. First, partial reduction is computed inside each SG, then atomic
16+
// add is used to reduce between SG leaders
17+
18+
#include <CL/sycl.hpp>
19+
#include <iostream>
20+
21+
using namespace sycl;
22+
using namespace sycl::ext::oneapi::experimental::matrix;
23+
24+
#define SG_SZ 8
25+
26+
#define TN SG_SZ
27+
#define TK 32
28+
29+
template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
30+
public:
31+
T *mat;
32+
33+
public:
34+
T *get_data() { return mat; }
35+
void set_data(T *data) { mat = data; }
36+
big_matrix(T *data) : mat(data) {}
37+
};
38+
39+
template <typename T, size_t M, size_t N>
40+
void sum_rows_ref(
41+
accessor<T, 2, access::mode::read, access::target::host_buffer> B,
42+
accessor<int, 1, access::mode::read, access::target::host_buffer>
43+
sum_rows) {
44+
int sum_rows_ref[M] = {0};
45+
for (size_t i = 0; i < M; i++) {
46+
for (size_t j = 0; j < N; j++) {
47+
sum_rows_ref[i] += B[i][j];
48+
}
49+
auto diff = sum_rows[i] - sum_rows_ref[i];
50+
assert(std::fabs(static_cast<int>(diff)) <=
51+
std::numeric_limits<int>::epsilon());
52+
}
53+
}
54+
55+
template <typename T, size_t M, size_t N>
56+
void matrix_sum_rows(queue q, big_matrix<T, M, N> &B, nd_range<2> &r) {
57+
buffer<int8_t, 2> bufB(B.get_data(), range<2>(M, N));
58+
// size of vector is known because SG size of set by the user in this case
59+
int sum_rows[M] = {0};
60+
buffer<int> sum_rows_v(sum_rows, M); // there are total of tK/4 * 2, 16 rows
61+
q.submit([&](handler &cgh) {
62+
auto accB = bufB.get_access<access::mode::read_write>(cgh);
63+
64+
auto v = sum_rows_v.get_access<access::mode::atomic>(cgh);
65+
66+
cgh.parallel_for<class add_matrix>(
67+
r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
68+
const auto global_idx = spmd_item.get_global_id(0);
69+
const auto global_idy = spmd_item.get_global_id(1);
70+
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
71+
const auto sg_starty = global_idy - spmd_item.get_local_id(1);
72+
73+
ext::oneapi::sub_group sg = spmd_item.get_sub_group();
74+
75+
joint_matrix<T, TK, TN, matrix_layout::packed_b> sub_b(sg);
76+
77+
joint_matrix_load(sg, sub_b,
78+
accB.get_pointer() + (global_idx * (TK / 4) * N) +
79+
sg_starty / SG_SZ * TN * 4,
80+
N, matrix_layout::packed_b);
81+
// calculate sum of rows in sum_rows_v[8], there are 8 rows in sub_b
82+
// (tK/4)
83+
int32_t sum_local_rows[M] = {0}; // 8 local rows, M total
84+
// sub_b has 32x8 elements, 32 elements per WI, 4 per WI per row
85+
auto data = sub_b.get_wi_data();
86+
87+
// each WI calculates local sum of rows
88+
for (int row = 0; row < TK / 4; row++) { // there are 8 rows
89+
for (int i = 0; i < data.length() / (TK / 4); i++) { // 4 per row
90+
// i*SG_SIZE index is found based on the round robin
91+
// distribution we are using in the implementation
92+
sum_local_rows[row + global_idx * (TK / 4)] += data[i + row * 4]
93+
}
94+
sum_local_rows[row + global_idx * (TK / 4)] = reduce_over_group(
95+
sg, sum_local_rows[row + global_idx * (TK / 4)],
96+
sycl::plus<>());
97+
98+
// only Groups leader perform the global reduction
99+
if (global_idy % 8 == 0) {
100+
atomic_fetch_add(v[row + global_idx * (TK / 4)],
101+
sum_local_rows[row + global_idx * (TK / 4)]);
102+
}
103+
}
104+
}); // parallel for
105+
}).wait();
106+
sum_rows_ref<T, M, N>(bufB.get_access<access::mode::read>(),
107+
sum_rows_v.get_access<access::mode::read>());
108+
}
109+
110+
static constexpr size_t MATRIX_K = TK / 4 * 2;
111+
static constexpr size_t MATRIX_N = TN * 4 * 2;
112+
int8_t B[MATRIX_K][MATRIX_N];
113+
114+
int main() {
115+
big_matrix<int8_t, MATRIX_K, MATRIX_N> MB((int8_t *)&B);
116+
117+
size_t NDRangeK = MATRIX_K / (TK / 4);
118+
size_t NDRangeN = (MATRIX_N / 4) / TN;
119+
queue q;
120+
nd_range<2> r({NDRangeK, NDRangeN * SG_SZ}, {1, 1 * SG_SZ});
121+
122+
for (int i = 0; i < MATRIX_K; i++) {
123+
for (int j = 0; j < MATRIX_N; j++) {
124+
B[i][j] = i;
125+
}
126+
}
127+
128+
matrix_sum_rows<int8_t, MATRIX_K, MATRIX_N>(q, MB, r);
129+
130+
return 0;
131+
}

0 commit comments

Comments
 (0)