intel
diff --git a/‎sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
Lines changed: 9 additions & 9 deletions b/‎sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
Lines changed: 9 additions & 9 deletions
diff --git a/‎sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp
Lines changed: 116 additions & 76 deletions b/‎sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp
Lines changed: 116 additions & 76 deletions
diff --git a/‎sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
Lines changed: 2 additions & 3 deletions b/‎sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
Lines changed: 2 additions & 3 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_apply.cpp
Lines changed: 0 additions & 14 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_apply.cpp
Lines changed: 0 additions & 14 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp
Lines changed: 7 additions & 17 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp
Lines changed: 7 additions & 17 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_fill.cpp
Lines changed: 0 additions & 14 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_fill.cpp
Lines changed: 0 additions & 14 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp
Lines changed: 3 additions & 5 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp
Lines changed: 3 additions & 5 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_mfma.cpp renamed to ‎sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp
Lines changed: 14 additions & 1 deletion b/‎sycl/test-e2e/Matrix/joint_matrix_hip_mfma.cpp renamed to ‎sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp
Lines changed: 14 additions & 1 deletion
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_half_apply.cpp
Lines changed: 0 additions & 12 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_half_apply.cpp
Lines changed: 0 additions & 12 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_half_fill.cpp
Lines changed: 0 additions & 12 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_half_fill.cpp
Lines changed: 0 additions & 12 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp
Lines changed: 22 additions & 0 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp
Lines changed: 22 additions & 0 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_half_mfma.cpp
Lines changed: 0 additions & 15 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_half_mfma.cpp
Lines changed: 0 additions & 15 deletions
diff --git a/‎sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp
Lines changed: 6 additions & 16 deletions b/‎sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp
Lines changed: 6 additions & 16 deletions
@@ -50,7 +50,7 @@ specification.*
 This extension is currently implemented in {dpcpp} only for devices
 that contain a matrix hardware, specifically Intel(R) Advanced Matrix
 Extensions (Intel(R) AMX), Intel(R) Xe Matrix Extensions (Intel(R)
-XMX), Nvidia(R) Tensor Cores and AMD Matrix Cores.
+XMX), Nvidia(R) Tensor Cores and AMD Matrix Cores(R).
 
 The `joint_matrix` type and the `joint_matrix_mad` function are
 optional kernel features as defined in section 5.7 of the core SYCL
@@ -68,7 +68,7 @@ implementation throws a synchronous exception with the
 == Overview
 Joint matrix is a SYCL extension for matrix hardware programming. It
 unifies targets like Intel AMX in CPUs, Intel XMX in Intel GPUs,
-Nvidia Tensor Cores and AMD Matrix Cores. This provides a portable and performant API for
+Nvidia Tensor Cores and AMD Matrix Cores(R). This provides a portable and performant API for
 users who want to build their own neural networks applications,
 perform custom optimizations, or experiment with new operations in a
 timely and performing manner.
@@ -922,7 +922,8 @@ matrix. Also, the type of the C matrix must be the same as the type of the D
 matrix.
 
 IMPORTANT: When compiling for the `ext_oneapi_cuda` backend the target
-arch backend flag, `-fsycl-targets=nvidia_gpu_sm_xx`, must
+arch backend flag, `-fsycl-targets=nvidia_gpu_sm_xx`
+(or equivalents, e.g. `-Xsycl-target-backend --cuda-gpu-arch=sm_xx`), must
 be used, where `sm_xx` must be a Compute Capability that is equal to
 or greater than the appropriate Minimum Compute Capability. When an
 executable has been compiled for `sm_xx`, if the executable is run on
@@ -965,15 +966,14 @@ multiple of 4 when `T` is `float`; where `T` is the type of the
 no restrictions to `stride`.
 
 ==== AMD Matrix Cores Supported Combinations
-The complete set of matrix data types and dimenstions that are supported by
+The complete set of matrix data types and dimensions that are supported by
 the `ext_oneapi_hip` backend are represented in the following
 table. In this architecture's implementation, A and B matrices must have the same type. 
 Similarly, C and D matrices must share the same type.
 
-IMPORTANT: Currently, only one block AMD Matrix Core instructions in 
-GFX90A (MI200, MI210, MI250 and MI250X GPUs) architecture are supported.
-When compiling for the `ext_oneapi_hip` backend the target arch backend flag,
- `-fsycl-targets=amd_gpu_gfx90a`, must
+IMPORTANT: The supported instructions may be run on the GFX90A architecture
+(MI200, MI210, MI250 and MI250X GPUs). When compiling for the `ext_oneapi_hip` backend the
+target arch backend flag, `-fsycl-targets=amd_gpu_gfx90a`, must
 be used. An attempt to run the compiled code on an unsupported architecture will throw an error. 
 
 
@@ -983,7 +983,7 @@ be used. An attempt to run the compiled code on an unsupported architecture will
 .2+| `matrix_type::fp16`  .2+| `matrix_type::fp32`
 |32 |32 |8 
 |16 |16 |16
-.2+| `matrix_type::int8`  .2+| `matrix_type::int32`
+.2+| `matrix_type::sint8`  .2+| `matrix_type::sint32`
 |32 |32 |8 
 |16 |16 |16
 .2+|`matrix_type::bf16`  .2+|`matrix_type::fp32`
 
@@ -52,9 +52,8 @@ struct joint_matrix {
       T, Rows, Cols, spv_matrix_layout_traits<Layout>::value,
       spv_scope_traits<Group>::value, spv_matrix_use_traits<Use>::value> *spvm;
 #else
-  static_assert(
-      false,
-      "The joint_matrix API is only supported by the Intel and CUDA backends");
+  static_assert(false, "The joint_matrix API is only supported by the Intel, "
+                       "CUDA and HIP (GFX90A) backends");
 #endif // defined(__NVPTX__)
 #endif // defined(__SYCL_DEVICE_ONLY__)
 
 
@@ -9,7 +9,7 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 using sycl::ext::oneapi::bfloat16;
 
 template <typename InType, typename OutType, size_t M, size_t N, size_t K>
-void hip_matrix_mfma() {
+void hip_matrix_apply() {
   InType A[M * K];
   InType B[K * N];
   OutType C[M * N];
@@ -55,29 +55,19 @@ void hip_matrix_mfma() {
                 joint_matrix<sub_group, InType, use::a, M, K, layout::col_major>
                     sub_a{};
 
-                joint_matrix_load(
-                    sg, sub_a,
-                    accA.template get_multi_ptr<access::decorated::yes>(), K);
-
-                joint_matrix_load(
-                    sg, sub_b,
-                    accB.template get_multi_ptr<access::decorated::yes>(), N);
-
-                joint_matrix_load(
-                    sg, sub_c,
-                    accC.template get_multi_ptr<access::decorated::yes>(), N,
-                    layout::row_major);
+                joint_matrix_load(sg, sub_a, accA.template get_multi_ptr(), K);
+                joint_matrix_load(sg, sub_b, accB.template get_multi_ptr(), N);
+                joint_matrix_load(sg, sub_c, accC.template get_multi_ptr(), N,
+                                  layout::row_major);
 
                 joint_matrix_apply(sg, sub_a, [=](InType v) { return v * 2; });
                 joint_matrix_apply(sg, sub_b, [=](InType v) { return v * 3; });
                 joint_matrix_apply(sg, sub_c, [=](OutType v) { return v * 4; });
 
                 sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
 
-                joint_matrix_store(
-                    sg, sub_c,
-                    accD.template get_multi_ptr<access::decorated::yes>(), N,
-                    layout::row_major);
+                joint_matrix_store(sg, sub_c, accD.template get_multi_ptr(), N,
+                                   layout::row_major);
               });
         })
         .wait();
 
@@ -9,7 +9,7 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 using sycl::ext::oneapi::bfloat16;
 
 template <typename InType, typename OutType, size_t M, size_t N, size_t K>
-void hip_matrix_mfma() {
+void hip_matrix_fill() {
   InType A[M * K];
   InType B[K * N];
   OutType C[M * N];
@@ -61,10 +61,8 @@ void hip_matrix_mfma() {
 
                 sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
 
-                joint_matrix_store(
-                    sg, sub_c,
-                    accD.template get_multi_ptr<access::decorated::yes>(), N,
-                    layout::row_major);
+                joint_matrix_store(sg, sub_c, accD.template get_multi_ptr(), N,
+                                   layout::row_major);
               });
         })
         .wait();
 
@@ -3,6 +3,8 @@
 
 // REQUIRES: gpu-amd-gfx90a
 
+#include "joint_matrix_hip_apply.hpp"
+#include "joint_matrix_hip_fill.hpp"
 #include "joint_matrix_hip_mfma.hpp"
 
 int main() {
@@ -11,10 +13,21 @@ int main() {
   hip_matrix_mfma<bfloat16, float, 32, 32, 8, layout::row_major>();
   hip_matrix_mfma<bfloat16, float, 16, 16, 16, layout::row_major>();
   hip_matrix_mfma<double, double, 16, 16, 4, layout::row_major>();
-
   hip_matrix_mfma<int8_t, int32_t, 32, 32, 8, layout::col_major>();
   hip_matrix_mfma<int8_t, int32_t, 16, 16, 16, layout::col_major>();
   hip_matrix_mfma<bfloat16, float, 32, 32, 8, layout::col_major>();
   hip_matrix_mfma<bfloat16, float, 16, 16, 16, layout::col_major>();
   hip_matrix_mfma<double, double, 16, 16, 4, layout::col_major>();
+
+  hip_matrix_fill<int8_t, int32_t, 32, 32, 8>();
+  hip_matrix_fill<int8_t, int32_t, 16, 16, 16>();
+  hip_matrix_fill<bfloat16, float, 32, 32, 8>();
+  hip_matrix_fill<bfloat16, float, 16, 16, 16>();
+  hip_matrix_fill<double, double, 16, 16, 4>();
+
+  hip_matrix_apply<int8_t, int32_t, 32, 32, 8>();
+  hip_matrix_apply<int8_t, int32_t, 16, 16, 16>();
+  hip_matrix_apply<bfloat16, float, 32, 32, 8>();
+  hip_matrix_apply<bfloat16, float, 16, 16, 16>();
+  hip_matrix_apply<double, double, 16, 16, 4>();
 }
@@ -0,0 +1,22 @@
+// RUN: %{build} -fsycl -fsycl-targets=amd_gpu_gfx90a -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
+// RUN: %{run} %t.out
+
+// REQUIRES: gpu-amd-gfx90a
+// REQUIRES: aspect-fp16
+
+#include "joint_matrix_hip_apply.hpp"
+#include "joint_matrix_hip_fill.hpp"
+#include "joint_matrix_hip_mfma.hpp"
+
+int main() {
+  hip_matrix_mfma<sycl::half, float, 32, 32, 8, layout::row_major>();
+  hip_matrix_mfma<sycl::half, float, 16, 16, 16, layout::row_major>();
+  hip_matrix_mfma<sycl::half, float, 32, 32, 8, layout::col_major>();
+  hip_matrix_mfma<sycl::half, float, 16, 16, 16, layout::col_major>();
+  // fill/apply take only <InType, OutType, M, N, K>; no layout argument.
+  hip_matrix_fill<sycl::half, float, 32, 32, 8>();
+  hip_matrix_fill<sycl::half, float, 16, 16, 16>();
+
+  hip_matrix_apply<sycl::half, float, 32, 32, 8>();
+  hip_matrix_apply<sycl::half, float, 16, 16, 16>();
+}
@@ -71,25 +71,15 @@ void hip_matrix_mfma() {
                 joint_matrix<sub_group, InType, use::a, M, K, layout::col_major>
                     sub_a{};
 
-                joint_matrix_load(
-                    sg, sub_a,
-                    accA.template get_multi_ptr<access::decorated::yes>(), K);
-
-                joint_matrix_load(
-                    sg, sub_b,
-                    accB.template get_multi_ptr<access::decorated::yes>(), N);
-
-                joint_matrix_load(
-                    sg, sub_c,
-                    accC.template get_multi_ptr<access::decorated::yes>(), N,
-                    layout::row_major);
+                joint_matrix_load(sg, sub_a, accA.template get_multi_ptr(), K);
+                joint_matrix_load(sg, sub_b, accB.template get_multi_ptr(), N);
+                joint_matrix_load(sg, sub_c, accC.template get_multi_ptr(), N,
+                                  layout::row_major);
 
                 sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
 
-                joint_matrix_store(
-                    sg, sub_c,
-                    accD.template get_multi_ptr<access::decorated::yes>(), N,
-                    OutLayout);
+                joint_matrix_store(sg, sub_c, accD.template get_multi_ptr(), N,
+                                   OutLayout);
               });
         })
         .wait();