Skip to content

Commit f6b2cb3

Browse files
author
mmoadeli
committed
* Update matrix core support into joint_matrix documentation.
* Use fully qualified names. * Add diagnostic to tell the user that joint_maitrx is only supported for gfx90a. * Remove unnecessary `else` conditions. * Merge HIP matrix cpp files.
1 parent b5c0911 commit f6b2cb3

13 files changed

+179
-194
lines changed

sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ specification.*
5050
This extension is currently implemented in {dpcpp} only for devices
5151
that contain a matrix hardware, specifically Intel(R) Advanced Matrix
5252
Extensions (Intel(R) AMX), Intel(R) Xe Matrix Extensions (Intel(R)
53-
XMX), Nvidia(R) Tensor Cores and AMD Matrix Cores.
53+
XMX), Nvidia(R) Tensor Cores and AMD Matrix Cores(R).
5454

5555
The `joint_matrix` type and the `joint_matrix_mad` function are
5656
optional kernel features as defined in section 5.7 of the core SYCL
@@ -68,7 +68,7 @@ implementation throws a synchronous exception with the
6868
== Overview
6969
Joint matrix is a SYCL extension for matrix hardware programming. It
7070
unifies targets like Intel AMX in CPUs, Intel XMX in Intel GPUs,
71-
Nvidia Tensor Cores and AMD Matrix Cores. This provides a portable and performant API for
71+
Nvidia Tensor Cores and AMD Matrix Cores(R). This provides a portable and performant API for
7272
users who want to build their own neural networks applications,
7373
perform custom optimizations, or experiment with new operations in a
7474
timely and performing manner.
@@ -922,7 +922,8 @@ matrix. Also, the type of the C matrix must be the same as the type of the D
922922
matrix.
923923

924924
IMPORTANT: When compiling for the `ext_oneapi_cuda` backend the target
925-
arch backend flag, `-fsycl-targets=nvidia_gpu_sm_xx`, must
925+
arch backend flag, `-fsycl-targets=nvidia_gpu_sm_xx`
926+
(or equivalents, e.g. `-Xsycl-target-backend --cuda-gpu-arch=sm_xx`), must
926927
be used, where `sm_xx` must be a Compute Capability that is equal to
927928
or greater than the appropriate Minimum Compute Capability. When an
928929
executable has been compiled for `sm_xx`, if the executable is run on
@@ -965,15 +966,14 @@ multiple of 4 when `T` is `float`; where `T` is the type of the
965966
no restrictions to `stride`.
966967

967968
==== AMD Matrix Cores Supported Combinations
968-
The complete set of matrix data types and dimenstions that are supported by
969+
The complete set of matrix data types and dimensions that are supported by
969970
the `ext_oneapi_hip` backend are represented in the following
970971
table. In this architecture's implementation, A and B matrices must have the same type.
971972
Similarly, C and D matrices must share the same type.
972973

973-
IMPORTANT: Currently, only one block AMD Matrix Core instructions in
974-
GFX90A (MI200, MI210, MI250 and MI250X GPUs) architecture are supported.
975-
When compiling for the `ext_oneapi_hip` backend the target arch backend flag,
976-
`-fsycl-targets=amd_gpu_gfx90a`, must
974+
IMPORTANT: The supported instructions may be run on GFX90A (MI200, MI210, MI250 and MI250X GPUs)
975+
architecture. When compiling for the `ext_oneapi_hip` backend the
976+
target arch backend flag, `-fsycl-targets=amd_gpu_gfx90a`, must
977977
be used. An attempt to run the compiled code on an unsupported architecture will throw an error.
978978

979979

@@ -983,7 +983,7 @@ be used. An attempt to run the compiled code on an unsupported architecture will
983983
.2+| `matrix_type::fp16` .2+| `matrix_type::fp32`
984984
|32 |32 |8
985985
|16 |16 |16
986-
.2+| `matrix_type::int8` .2+| `matrix_type::int32`
986+
.2+| `matrix_type::sint8` .2+| `matrix_type::sint32`
987987
|32 |32 |8
988988
|16 |16 |16
989989
.2+|`matrix_type::bf16` .2+|`matrix_type::fp32`

sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp

Lines changed: 116 additions & 76 deletions
Large diffs are not rendered by default.

sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,8 @@ struct joint_matrix {
5252
T, Rows, Cols, spv_matrix_layout_traits<Layout>::value,
5353
spv_scope_traits<Group>::value, spv_matrix_use_traits<Use>::value> *spvm;
5454
#else
55-
static_assert(
56-
false,
57-
"The joint_matrix API is only supported by the Intel and CUDA backends");
55+
static_assert(false, "The joint_matrix API is only supported by the Intel, "
56+
"CUDA and HIP (GFX90A) backends");
5857
#endif // defined(__NVPTX__)
5958
#endif // defined(__SYCL_DEVICE_ONLY__)
6059

sycl/test-e2e/Matrix/joint_matrix_hip_apply.cpp

Lines changed: 0 additions & 14 deletions
This file was deleted.

sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ using namespace sycl::ext::oneapi::experimental::matrix;
99
using sycl::ext::oneapi::bfloat16;
1010

1111
template <typename InType, typename OutType, size_t M, size_t N, size_t K>
12-
void hip_matrix_mfma() {
12+
void hip_matrix_apply() {
1313
InType A[M * K];
1414
InType B[K * N];
1515
OutType C[M * N];
@@ -55,29 +55,19 @@ void hip_matrix_mfma() {
5555
joint_matrix<sub_group, InType, use::a, M, K, layout::col_major>
5656
sub_a{};
5757

58-
joint_matrix_load(
59-
sg, sub_a,
60-
accA.template get_multi_ptr<access::decorated::yes>(), K);
61-
62-
joint_matrix_load(
63-
sg, sub_b,
64-
accB.template get_multi_ptr<access::decorated::yes>(), N);
65-
66-
joint_matrix_load(
67-
sg, sub_c,
68-
accC.template get_multi_ptr<access::decorated::yes>(), N,
69-
layout::row_major);
58+
joint_matrix_load(sg, sub_a, accA.template get_multi_ptr(), K);
59+
joint_matrix_load(sg, sub_b, accB.template get_multi_ptr(), N);
60+
joint_matrix_load(sg, sub_c, accC.template get_multi_ptr(), N,
61+
layout::row_major);
7062

7163
joint_matrix_apply(sg, sub_a, [=](InType v) { return v * 2; });
7264
joint_matrix_apply(sg, sub_b, [=](InType v) { return v * 3; });
7365
joint_matrix_apply(sg, sub_c, [=](OutType v) { return v * 4; });
7466

7567
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
7668

77-
joint_matrix_store(
78-
sg, sub_c,
79-
accD.template get_multi_ptr<access::decorated::yes>(), N,
80-
layout::row_major);
69+
joint_matrix_store(sg, sub_c, accD.template get_multi_ptr(), N,
70+
layout::row_major);
8171
});
8272
})
8373
.wait();

sycl/test-e2e/Matrix/joint_matrix_hip_fill.cpp

Lines changed: 0 additions & 14 deletions
This file was deleted.

sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ using namespace sycl::ext::oneapi::experimental::matrix;
99
using sycl::ext::oneapi::bfloat16;
1010

1111
template <typename InType, typename OutType, size_t M, size_t N, size_t K>
12-
void hip_matrix_mfma() {
12+
void hip_matrix_fill() {
1313
InType A[M * K];
1414
InType B[K * N];
1515
OutType C[M * N];
@@ -61,10 +61,8 @@ void hip_matrix_mfma() {
6161

6262
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
6363

64-
joint_matrix_store(
65-
sg, sub_c,
66-
accD.template get_multi_ptr<access::decorated::yes>(), N,
67-
layout::row_major);
64+
joint_matrix_store(sg, sub_c, accD.template get_multi_ptr(), N,
65+
layout::row_major);
6866
});
6967
})
7068
.wait();

sycl/test-e2e/Matrix/joint_matrix_hip_mfma.cpp renamed to sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
// REQUIRES: gpu-amd-gfx90a
55

6+
#include "joint_matrix_hip_apply.hpp"
7+
#include "joint_matrix_hip_fill.hpp"
68
#include "joint_matrix_hip_mfma.hpp"
79

810
int main() {
@@ -11,10 +13,21 @@ int main() {
1113
hip_matrix_mfma<bfloat16, float, 32, 32, 8, layout::row_major>();
1214
hip_matrix_mfma<bfloat16, float, 16, 16, 16, layout::row_major>();
1315
hip_matrix_mfma<double, double, 16, 16, 4, layout::row_major>();
14-
1516
hip_matrix_mfma<int8_t, int32_t, 32, 32, 8, layout::col_major>();
1617
hip_matrix_mfma<int8_t, int32_t, 16, 16, 16, layout::col_major>();
1718
hip_matrix_mfma<bfloat16, float, 32, 32, 8, layout::col_major>();
1819
hip_matrix_mfma<bfloat16, float, 16, 16, 16, layout::col_major>();
1920
hip_matrix_mfma<double, double, 16, 16, 4, layout::col_major>();
21+
22+
hip_matrix_fill<int8_t, int32_t, 32, 32, 8>();
23+
hip_matrix_fill<int8_t, int32_t, 16, 16, 16>();
24+
hip_matrix_fill<bfloat16, float, 32, 32, 8>();
25+
hip_matrix_fill<bfloat16, float, 16, 16, 16>();
26+
hip_matrix_fill<double, double, 16, 16, 4>();
27+
28+
hip_matrix_apply<int8_t, int32_t, 32, 32, 8>();
29+
hip_matrix_apply<int8_t, int32_t, 16, 16, 16>();
30+
hip_matrix_apply<bfloat16, float, 32, 32, 8>();
31+
hip_matrix_apply<bfloat16, float, 16, 16, 16>();
32+
hip_matrix_apply<double, double, 16, 16, 4>();
2033
}

sycl/test-e2e/Matrix/joint_matrix_hip_half_apply.cpp

Lines changed: 0 additions & 12 deletions
This file was deleted.

sycl/test-e2e/Matrix/joint_matrix_hip_half_fill.cpp

Lines changed: 0 additions & 12 deletions
This file was deleted.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// RUN: %{build} -fsycl -fsycl-targets=amd_gpu_gfx90a -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
// REQUIRES: gpu-amd-gfx90a
5+
// REQUIRES: aspect-fp16
6+
7+
#include "joint_matrix_hip_apply.hpp"
8+
#include "joint_matrix_hip_fill.hpp"
9+
#include "joint_matrix_hip_mfma.hpp"
10+
11+
int main() {
12+
hip_matrix_fill<sycl::half, float, 32, 32, 8, layout::row_major>();
13+
hip_matrix_fill<sycl::half, float, 16, 16, 16, layout::row_major>();
14+
hip_matrix_fill<sycl::half, float, 32, 32, 8, layout::col_major>();
15+
hip_matrix_fill<sycl::half, float, 16, 16, 16, layout::col_major>();
16+
17+
hip_matrix_fill<sycl::half, float, 32, 32, 8>();
18+
hip_matrix_fill<sycl::half, float, 16, 16, 16>();
19+
20+
hip_matrix_apply<sycl::half, float, 32, 32, 8>();
21+
hip_matrix_apply<sycl::half, float, 16, 16, 16>();
22+
}

sycl/test-e2e/Matrix/joint_matrix_hip_half_mfma.cpp

Lines changed: 0 additions & 15 deletions
This file was deleted.

sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -71,25 +71,15 @@ void hip_matrix_mfma() {
7171
joint_matrix<sub_group, InType, use::a, M, K, layout::col_major>
7272
sub_a{};
7373

74-
joint_matrix_load(
75-
sg, sub_a,
76-
accA.template get_multi_ptr<access::decorated::yes>(), K);
77-
78-
joint_matrix_load(
79-
sg, sub_b,
80-
accB.template get_multi_ptr<access::decorated::yes>(), N);
81-
82-
joint_matrix_load(
83-
sg, sub_c,
84-
accC.template get_multi_ptr<access::decorated::yes>(), N,
85-
layout::row_major);
74+
joint_matrix_load(sg, sub_a, accA.template get_multi_ptr(), K);
75+
joint_matrix_load(sg, sub_b, accB.template get_multi_ptr(), N);
76+
joint_matrix_load(sg, sub_c, accC.template get_multi_ptr(), N,
77+
layout::row_major);
8678

8779
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
8880

89-
joint_matrix_store(
90-
sg, sub_c,
91-
accD.template get_multi_ptr<access::decorated::yes>(), N,
92-
OutLayout);
81+
joint_matrix_store(sg, sub_c, accD.template get_multi_ptr(), N,
82+
OutLayout);
9383
});
9484
})
9585
.wait();

0 commit comments

Comments
 (0)