Update callers of joint_matrix_mad to pass the matrix that receives the result as a function parameter instead of assigning the return value.

mmoadeli · mmoadeli · commit 42e0c62ee83b · 2023-10-12T10:35:51.000+01:00
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-hip.hpp
@@ -265,7 +265,7 @@ template <typename Group,
           size_t M, size_t N, access::address_space Space,
           access::decorated IsDecorated>
 void store_layoutT(
-    joint_matrix_hip<
+    const joint_matrix_hip<
         T, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
         sycl::ext::oneapi::experimental::matrix::layout::dynamic> &src,
     multi_ptr<T, Space, IsDecorated> dst, size_t stride, Group &sg) {
@@ -333,7 +333,7 @@ void store_layoutT(
 template <typename Group, typename T, size_t M, size_t N,
           access::address_space Space, access::decorated IsDecorated>
 void joint_matrix_store_hip(
-    joint_matrix_hip<
+    const joint_matrix_hip<
         T, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
         sycl::ext::oneapi::experimental::matrix::layout::dynamic> &src,
     multi_ptr<T, Space, IsDecorated> dst, size_t stride,
@@ -356,11 +356,11 @@ void joint_matrix_mad_hip(
     joint_matrix_hip<
         Tc, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
         sycl::ext::oneapi::experimental::matrix::layout::dynamic> &D,
-    joint_matrix_hip<Tm, sycl::ext::oneapi::experimental::matrix::use::a, M, K,
-                     LayoutA> &A,
-    joint_matrix_hip<Tm, sycl::ext::oneapi::experimental::matrix::use::b, K, N,
-                     LayoutB> &B,
-    joint_matrix_hip<
+    const joint_matrix_hip<Tm, sycl::ext::oneapi::experimental::matrix::use::a,
+                           M, K, LayoutA> &A,
+    const joint_matrix_hip<Tm, sycl::ext::oneapi::experimental::matrix::use::b,
+                           K, N, LayoutB> &B,
+    const joint_matrix_hip<
         Tc, sycl::ext::oneapi::experimental::matrix::use::accumulator, M, N,
         sycl::ext::oneapi::experimental::matrix::layout::dynamic> &C) {
   if constexpr (std::is_same_v<Tm, sycl::half>) {
@@ -387,12 +387,12 @@ void joint_matrix_mad_hip(
   } else if constexpr (std::is_same_v<Tm, int8_t>) {
     if constexpr (M == 16 && N == 16) {
       D.data = __builtin_amdgcn_mfma_i32_16x16x16i8(
-          *reinterpret_cast<int32_t *>(A.data),
-          *reinterpret_cast<int32_t *>(B.data), C.data, 0, 0, 0);
+          *reinterpret_cast<const Tc *>(A.data),
+          *reinterpret_cast<const Tc *>(B.data), C.data, 0, 0, 0);
     } else if constexpr (M == 32 && N == 32) {
       D.data = __builtin_amdgcn_mfma_i32_32x32x8i8(
-          *reinterpret_cast<int32_t *>(A.data),
-          *reinterpret_cast<int32_t *>(B.data), C.data, 0, 0, 0);
+          *reinterpret_cast<const Tc *>(A.data),
+          *reinterpret_cast<const Tc *>(B.data), C.data, 0, 0, 0);
     }
   } else {
     static_assert(false && "Invalid configuration!");
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -192,7 +192,6 @@ joint_matrix_fill(Group,
 #if defined(__NVPTX__)
   res.cuda_impl.wi_marray = v;
 #elif defined(__HIP_PLATFORM_AMD_MFMA__)
-  std::ignore = sg;
   sycl::ext::oneapi::detail::joint_matrix_apply(res.hip_impl,
                                                 [=](T) { return v; });
 #else
@@ -219,7 +218,7 @@ template <
     std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value, bool> =
         true>
 inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
-    Group,
+    Group &sg,
     joint_matrix<Group, S, use::accumulator, NumRows, NumCols,
                  sycl::ext::oneapi::experimental::matrix::layout::dynamic> &res,
     multi_ptr<T, Space, IsDecorated> src, size_t stride,
@@ -228,6 +227,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
   static_assert(Space != access::address_space::private_space,
                 "Joint Matrix doesn't support load from private memory!");
 #if defined(__NVPTX__)
+  std::ignore = sg;
   sycl::ext::oneapi::detail::load_accumulator_cuda(res.cuda_impl, src, stride,
                                                    Layout);
 #elif defined(__HIP_PLATFORM_AMD_MFMA__)
@@ -266,6 +266,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
   }
 #endif // defined(__NVPTX__)
 #else
+  std::ignore = sg;
   std::ignore = res;
   std::ignore = src;
   std::ignore = stride;
@@ -284,13 +285,14 @@ template <
                           std::is_same<std::remove_const_t<T>, float>::value),
                      bool> = true>
 inline __SYCL_ALWAYS_INLINE void
-joint_matrix_load(Group,
+joint_matrix_load(Group &sg,
                   joint_matrix<Group, S, Use, NumRows, NumCols, Layout> &res,
                   multi_ptr<T, Space, IsDecorated> src, size_t stride) {
 #if defined(__SYCL_DEVICE_ONLY__)
   static_assert(Space != access::address_space::private_space,
                 "Joint Matrix doesn't support load from private memory!");
 #if defined(__NVPTX__)
+  std::ignore = sg;
   sycl::ext::oneapi::detail::load_multiplicand_cuda<S, T, NumRows, NumCols, Use,
                                                     Layout, Space>(
       res.cuda_impl, src, stride);
@@ -320,7 +322,7 @@ joint_matrix_load(Group,
 template <typename Group, typename T, size_t NumRows, size_t NumCols,
           access::address_space Space, access::decorated IsDecorated>
 inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
-    Group,
+    Group &sg,
     const joint_matrix<Group, T, use::accumulator, NumRows, NumCols,
                        sycl::ext::oneapi::experimental::matrix::layout::dynamic>
         &src,
@@ -330,6 +332,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
   static_assert(Space != access::address_space::private_space,
                 "Joint Matrix doesn't support store to private memory!");
 #if defined(__NVPTX__)
+  std::ignore = sg;
   sycl::ext::oneapi::detail::joint_matrix_store_cuda<T, NumRows, NumCols,
                                                      Space>(src.cuda_impl, dst,
                                                             stride, Layout);
@@ -403,13 +406,9 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_mad(
   }
 #elif defined(__HIP_PLATFORM_AMD_MFMA__)
   if constexpr (std::is_same<Ta, Tb>::value) {
-    joint_matrix<Group, Tc, use::accumulator, M, N,
-                 sycl::ext::oneapi::experimental::matrix::layout::dynamic>
-        D;
     sycl::ext::oneapi::detail::joint_matrix_mad_hip<Ta, Tc, M, K, N, LayoutA,
                                                     LayoutB>(
         D.hip_impl, A.hip_impl, B.hip_impl, C.hip_impl);
-    return D;
   } else {
     assert(false && "Ta != Tb : In the HIP backend joint_matrix_mad "
                     "requires that joint_matrix data types Ta and Tb match");
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_apply.hpp
@@ -70,7 +70,7 @@ void hip_matrix_apply() {
                 joint_matrix_apply(sg, sub_b, [=](InType v) { return v * 3; });
                 joint_matrix_apply(sg, sub_c, [=](OutType v) { return v * 4; });
 
-                sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+                joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
 
                 joint_matrix_store(
                     sg, sub_c,
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_fill.hpp
@@ -59,7 +59,7 @@ void hip_matrix_fill() {
                 joint_matrix_fill(sg, sub_b, 2);
                 joint_matrix_fill(sg, sub_c, 3);
 
-                sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+                joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
 
                 joint_matrix_store(
                     sg, sub_c,
diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp b/sycl/test-e2e/Matrix/joint_matrix_hip_mfma.hpp
@@ -82,7 +82,7 @@ void hip_matrix_mfma() {
                     accC.template get_multi_ptr<access::decorated::yes>(), N,
                     layout::row_major);
 
-                sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+                joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
 
                 joint_matrix_store(
                     sg, sub_c,