-
Notifications
You must be signed in to change notification settings - Fork 790
[SYCL][Matrix] Add initial get_coord API. #7037
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
59d0e7e
[SYCL][Matrix] Add initial get_coord API.
135d82b
Reviewers comments
95adb66
clang-format
c7e6000
Using olc_vec type in the spirv operation and creating a tuple using …
7742fd9
clang-format
b4e3ef5
Review comments
57a97cf
Makeaccess through USM, also update the basic kernel with use of bflo…
1c5ace5
clang-format
4529e66
Reviewer comments
643aafc
Clang-format
8e73c9d
More comments addressed.
b2ca8e4
Fixing small error
d2dfda6
Adding XFAIL to test cases when we run. Will take away when the full…
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
// RUN: %clangxx -fsycl -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=2 %s -o %t.out | ||
// RUN: %t.out | ||
// XFAIL: * | ||
|
||
// this code calculates the sum of rows into a global array of number of rows | ||
// elements. First, partial reduction is computed inside each SG, then atomic | ||
// add is used to reduce between SG leaders. The get_coord() API is used for | ||
// retrieving the row | ||
|
||
arnamoy10 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
#include <iostream> | ||
#include <sycl/sycl.hpp> | ||
|
||
using namespace sycl; | ||
using namespace sycl::ext::oneapi::experimental::matrix; | ||
|
||
#define SG_SZ 16 | ||
|
||
#define TN SG_SZ | ||
#define TK 32 | ||
|
||
template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix { | ||
public: | ||
T *mat; | ||
|
||
public: | ||
T *get_data() { return mat; } | ||
void set_data(T *data) { mat = data; } | ||
big_matrix(T *data) : mat(data) {} | ||
}; | ||
|
||
template <typename T, size_t M, size_t N> | ||
void sum_rows_ref( | ||
accessor<T, 2, access::mode::read, access::target::host_buffer> B, | ||
accessor<int, 1, access::mode::read, access::target::host_buffer> | ||
sum_rows) { | ||
int sum_rows_ref[M] = {0}; | ||
for (size_t i = 0; i < M; i++) { | ||
for (size_t j = 0; j < N; j++) { | ||
sum_rows_ref[i] += B[i][j]; | ||
} | ||
auto diff = sum_rows[i] - sum_rows_ref[i]; | ||
assert(std::fabs(static_cast<int>(diff)) <= | ||
std::numeric_limits<int>::epsilon()); | ||
} | ||
} | ||
|
||
template <typename T, size_t M, size_t N> | ||
void matrix_sum_rows(queue q, big_matrix<T, M, N> &B, nd_range<2> &r) { | ||
buffer<int8_t, 2> bufB(B.get_data(), range<2>(M, N)); | ||
// size of vector is known because SG size of set by the user in this case | ||
int sum_rows[M] = {0}; | ||
buffer<int> sum_rows_v(sum_rows, M); // there are total of tK/4 * 2, 16 rows | ||
q.submit([&](handler &cgh) { | ||
auto accB = bufB.get_access<access::mode::read_write>(cgh); | ||
|
||
auto v = sum_rows_v.get_access<access::mode::atomic>(cgh); | ||
|
||
cgh.parallel_for<class add_matrix>( | ||
r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { | ||
const auto global_idx = spmd_item.get_global_id(0); | ||
const auto global_idy = spmd_item.get_global_id(1); | ||
const auto sg_startx = global_idx - spmd_item.get_local_id(0); | ||
const auto sg_starty = global_idy - spmd_item.get_local_id(1); | ||
|
||
ext::oneapi::sub_group sg = spmd_item.get_sub_group(); | ||
|
||
joint_matrix<T, TK, TN, use::b> sub_b(sg); | ||
|
||
joint_matrix_load(sg, sub_b, | ||
accB.get_pointer() + (global_idx * (TK / 4) * N) + | ||
sg_starty / SG_SZ * TN * 4, | ||
N, layout::packed_b); | ||
|
||
int32_t sum_local_rows[M] = {0}; | ||
auto tBData = sub_b.get_wi_data(); | ||
|
||
// each WI calculates local sum of rows | ||
for (int i = 0; i < tBData.length(); ++i) { | ||
// row and col holds global co_ordinates of the matrix | ||
auto [row, col] = tBData[i].get_coord(); | ||
sum_local_rows[row] += tBData[i]; | ||
|
||
sum_local_rows[row] = | ||
reduce_over_group(sg, sum_local_rows[row], sycl::plus<>()); | ||
// only Groups leader perform the global reduction | ||
if (global_idy % SG_SZ == 0) { | ||
atomic_fetch_add(v[row], sum_local_rows[row]); | ||
} | ||
} | ||
}); // parallel for | ||
}).wait(); | ||
sum_rows_ref<T, M, N>(bufB.get_access<access::mode::read>(), | ||
sum_rows_v.get_access<access::mode::read>()); | ||
} | ||
|
||
static constexpr size_t MATRIX_K = TK / 4 * 2; | ||
static constexpr size_t MATRIX_N = TN * 4 * 2; | ||
int8_t B[MATRIX_K][MATRIX_N]; | ||
|
||
int main() { | ||
big_matrix<int8_t, MATRIX_K, MATRIX_N> MB((int8_t *)&B); | ||
|
||
size_t NDRangeK = MATRIX_K / (TK / 4); | ||
size_t NDRangeN = (MATRIX_N / 4) / TN; | ||
queue q; | ||
nd_range<2> r({NDRangeK, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}); | ||
|
||
for (int i = 0; i < MATRIX_K; i++) { | ||
for (int j = 0; j < MATRIX_N; j++) { | ||
B[i][j] = i; | ||
} | ||
} | ||
|
||
matrix_sum_rows<int8_t, MATRIX_K, MATRIX_N>(q, MB, r); | ||
|
||
return 0; | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.