Adds separate test comparing wi_marray with get_wi_data usage.

JackAKirk · JackAKirk · commit 5ea081ad7a26 · 2022-06-08T11:39:47.000+01:00
Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp
@@ -0,0 +1,67 @@
+//==----------- element_wise_wi_marray.cpp  - DPC++ joint_matrix------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out 
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using sycl::ext::oneapi::experimental::bfloat16;
+
+#define SG_SZ 32
+
+// Applies fabs() to two identically filled joint_matrix objects, once per
+// element through get_wi_data() and once through wi_marray, then compares.
+template <typename T, size_t M, size_t K> void verify_wi_marray(queue q) {
+  int err = 0;
+  {
+    buffer<int> err_buf(&err, 1);
+    q.submit([&](handler &cgh) {
+       accessor<int, 1, access::mode::write, target::device> flag(err_buf, cgh);
+       // One work-group of SG_SZ work-items, with sub-group size SG_SZ.
+       cgh.parallel_for<class marray_kernel>(
+           nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}),
+           [flag](nd_item<2> item)[[sycl::reqd_sub_group_size(SG_SZ)]] {
+             auto sub_grp = item.get_sub_group();
+             joint_matrix<T, matrix_use::a, M, K> via_slice;
+             joint_matrix<T, matrix_use::a, M, K> via_marray;
+             joint_matrix_fill(sub_grp, via_slice, -1);
+             joint_matrix_fill(sub_grp, via_marray, -1);
+
+             // Element-by-element fabs through the get_wi_data() view.
+             auto slice = via_slice.get_wi_data();
+             for (int idx = 0; idx < slice.length(); ++idx)
+               slice[idx] = fabs(slice[idx]);
+             // The same operation expressed as one call on the whole marray.
+             via_marray.wi_marray = fabs(via_marray.wi_marray);
+
+             // Any element that differs between the two paths sets the flag.
+             for (int idx = 0; idx < via_marray.wi_marray.size(); ++idx)
+               if (via_marray.wi_marray[idx] != slice[idx])
+                 flag[0] = 1;
+           }); // parallel for
+     })
+        .wait();
+  }
+  assert(err == 0);
+}
+
+// Runs the wi_marray check only when the device reports version 8.0 or above.
+int main() {
+  queue q;
+
+  // The backend version string is parsed as a float and, on the CUDA backend
+  // this test requires, treated as the device's compute capability.
+  const float cc =
+      std::stof(q.get_device().get_info<info::device::backend_version>());
+  if (cc >= 8.0)
+    verify_wi_marray<bfloat16, 16, 16>(q);
+  return 0;
+}
diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp
@@ -68,7 +68,7 @@ T2 matrix_ref_mn(const int &m, const int &n, T1 *A, T1 *B, T2 *C) {
       res += make_fp32(A[m * Big_K + k]) * make_fp32(B[k * Big_N + n]);
   } else if constexpr (std::is_same<T1, bfloat16>::value) {
     for (int k = 0; k < Big_K; k++)
-      res += (make_fp32(A[m * Big_K + k].raw()) * 2 + 1) *
+      res += make_fp32(A[m * Big_K + k].raw()) *
              make_fp32(B[k * Big_N + n].raw());
   } else {
     for (int k = 0; k < Big_K; k++)
@@ -192,14 +192,6 @@ void test(queue &q) {
                                 accA.get_pointer() + (k * K) + (m * M * Big_K),
                                 Big_K);
 
-              if constexpr (std::is_same<T1, bfloat16>::value) {
-                marray<bfloat16, sub_a.wi_marray.size()> b, c;
-                b = 2;
-                c = 1;
-                sub_a.wi_marray =
-                    sycl::ext::oneapi::experimental::fma(sub_a.wi_marray, b, c);
-              }
-
               joint_matrix_load(sg, sub_b,
                                 accB.get_pointer() + (k * K * Big_N) + (n * N),
                                 Big_N);