[SYCL][Joint Matrix][E2E] Add tests for big shapes for col major A and B loads/stores (#16999)

YuriPlyakhin · web-flow · commit 1c2587ddd57c · 2025-02-26T09:22:37.000-08:00
diff --git a/sycl/test-e2e/Matrix/Inputs/common.hpp b/sycl/test-e2e/Matrix/Inputs/common.hpp
@@ -5,7 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+#include <bitset>
 #include <cmath>
+#include <iomanip>
 #include <iostream>
 #include <random>
 #include <sycl/detail/core.hpp>
@@ -18,6 +20,18 @@ namespace syclex = sycl::ext::oneapi::experimental;
 namespace syclintelex = sycl::ext::intel::experimental;
 using bfloat16 = sycl::ext::oneapi::bfloat16;
 
+// Prints the 32-bit pattern of `value` to std::cout as 8 zero-padded hex
+// digits; the stream's format flags and fill character are restored after.
+void print_float_as_hex(float value) {
+  union { float f; uint32_t i; } v{value}; // v.i reads back the bits of v.f
+  const std::ios_base::fmtflags saved_flags = std::cout.flags();
+  // fill is sticky and not part of fmtflags; fill('0') returns the old one.
+  const char saved_fill = std::cout.fill('0');
+  std::cout << std::hex << std::setw(8) << v.i;
+  std::cout.flags(saved_flags);
+  std::cout.fill(saved_fill);
+}
+
 // Most of the time, failures related to floating-point calculations (both float
 // and bfloat16) are caused by accumulation errors rather than the algorithm
 // itself. If it is an algorithm issue, the calculated result gap from the
@@ -223,13 +237,19 @@ template <typename KernelName> size_t get_sg_size(queue q) {
 }
 
 template <typename T>
-void matrix_print(unsigned int rows, unsigned int cols, T *mat) {
+void matrix_print(unsigned int rows, unsigned int cols, T *mat,
+                  bool hex = false) {
   for (unsigned int i = 0; i < rows; i++) {
     for (unsigned int j = 0; j < cols; j++) {
       if constexpr (std::is_integral_v<T>)
         std::cout << (int)mat[i * cols + j] << " ";
-      else
-        std::cout << (float)mat[i * cols + j] << " ";
+      else {
+        if (hex)
+          print_float_as_hex((float)mat[i * cols + j]);
+        else
+          std::cout << (float)mat[i * cols + j];
+        std::cout << " ";
+      }
     }
     std::cout << "\n";
   }
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
@@ -9,7 +9,8 @@
 #include <iostream>
 #include <sycl/usm.hpp>
 
-template <typename Tab, size_t K, layout B_layout> class mult;
+template <typename Tab, size_t TM, size_t TN, size_t TK, layout B_layout>
+class mult;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K, size_t TM,
           size_t TN, size_t TK, layout A_layout, layout B_layout>
@@ -18,11 +19,11 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
   // Add one iteration for the out of bounds dpas instruction
   size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0);
   size_t NDRangeN = N / TN + (((N % TN) != 0) ? 1 : 0);
-  size_t sg_size = get_sg_size<mult<T2, K, B_layout>>(q);
+  size_t sg_size = get_sg_size<mult<T2, TM, TN, TK, B_layout>>(q);
   std::cout << "SG size: " << sg_size << " ";
 
   q.submit([&](handler &cgh) {
-     cgh.parallel_for<mult<T2, K, B_layout>>(
+     cgh.parallel_for<mult<T2, TM, TN, TK, B_layout>>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
@@ -147,13 +148,38 @@ void test() {
 
 template <layout A_layout, layout B_layout> void test_all() {
   std::cout << "bf16: ";
-  test<bfloat16, float, /*MATRIX_M*/ 1024 + 20, /*MATRIX_N*/ 1024 + 20,
+  test<bfloat16, float, /*MATRIX_M*/ 1024 + 24, /*MATRIX_N*/ 1024 + 24,
        /*MATRIX_K*/ 1024 + 24, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, A_layout,
        B_layout>();
   std::cout << "half: ";
-  test<half, float, 1024 + 20, 1024 + 20, 1024 + 24, 8, 16, 16, A_layout,
+  test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 8, 16, 16, A_layout,
        B_layout>();
   std::cout << "int8: ";
-  test<int8_t, int32_t, 1024, 1024 + 20, 1024 + 24, 8, 16, 32, A_layout,
+  test<int8_t, int32_t, 1024, 1024, 1024 + 16, 8, 16, 32, A_layout, B_layout>();
+}
+
+template <layout A_layout, layout B_layout> void test_all_big_shapes() {
+  std::cout << "bf16: ";
+  test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 16, 16, 16, A_layout,
+       B_layout>();
+  test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 1, 64, 16, A_layout,
+       B_layout>();
+  test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 1, 64, 32, A_layout,
+       B_layout>();
+  test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 32, 64, 16, A_layout,
+       B_layout>();
+  test<bfloat16, float, 1024 + 24, 1024 + 24, 1024 + 24, 32, 64, 32, A_layout,
+       B_layout>();
+
+  std::cout << "half: ";
+  test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 16, 16, 16, A_layout,
+       B_layout>();
+  test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 1, 64, 16, A_layout,
+       B_layout>();
+  test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 1, 64, 32, A_layout,
+       B_layout>();
+  test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 32, 64, 16, A_layout,
+       B_layout>();
+  test<half, float, 1024 + 24, 1024 + 24, 1024 + 24, 32, 64, 32, A_layout,
        B_layout>();
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_16bit_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_16bit_colmajorA_colmajorB.cpp
@@ -29,13 +29,10 @@
 
 #include "common.hpp"
 
-constexpr size_t TM = 8;
-constexpr size_t TN = 16;
-constexpr size_t TK = 16;
+template <typename T, size_t TM, size_t TN, size_t TK> class imatrix;
 
-template <typename T> class imatrix;
-
-template <typename T1, typename T2, size_t M, size_t N, size_t K>
+template <size_t TM, size_t TN, size_t TK, typename T1, typename T2, size_t M,
+          size_t N, size_t K>
 void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                      big_matrix<T2, K, N> &B) {
   size_t NDRangeM = M / TM;
@@ -45,15 +42,15 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
-  size_t sg_size = get_sg_size<class imatrix<T2>>(q);
+  size_t sg_size = get_sg_size<class imatrix<T2, TM, TN, TK>>(q);
   std::cout << "subgroup size " << sg_size << " ";
 
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.template get_access<access::mode::read_write>(cgh);
      auto accB = bufB.template get_access<access::mode::read_write>(cgh);
 
-     cgh.parallel_for<class imatrix<T2>>(
+     cgh.parallel_for<class imatrix<T2, TM, TN, TK>>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
@@ -100,10 +97,13 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
    }).wait();
 }
 
-template <typename T> void test() {
+template <typename T, size_t TM, size_t TN, size_t TK> void test() {
+  std::cout << TM << "x" << TN << "x" << TK << " ";
+
   static constexpr size_t MATRIX_M = TM * 2;
   static constexpr size_t MATRIX_N = TN * 2;
   static constexpr size_t MATRIX_K = TK * 2;
+
   T A[MATRIX_K][MATRIX_M];
   T B[MATRIX_N][MATRIX_K];
   float C[MATRIX_M][MATRIX_N];
@@ -120,7 +120,7 @@ template <typename T> void test() {
   big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
   big_matrix<T, MATRIX_M, MATRIX_K> MA((T *)&A);
   big_matrix<T, MATRIX_K, MATRIX_N> MB((T *)&B);
-  matrix_multiply(MC, MA, MB);
+  matrix_multiply<TM, TN, TK>(MC, MA, MB);
   matrix_multiply_ref((T *)A, (T *)B, (float *)D, MATRIX_M, MATRIX_N, MATRIX_K,
                       false, true, true);
 
@@ -138,13 +138,27 @@ int main() {
   for (auto &combination : combinations) {
     if (!bf16_run && combination.atype == matrix_type::bf16) {
       std::cout << "bf16 ";
-      test<bfloat16>();
+      test<bfloat16, 8, 16, 16>();
+#ifdef BIG_SHAPES
+      test<bfloat16, 16, 16, 16>();
+      test<bfloat16, 1, 64, 16>();
+      test<bfloat16, 1, 64, 32>();
+      test<bfloat16, 32, 64, 16>();
+      test<bfloat16, 32, 64, 32>();
+#endif
       bf16_run = true;
     }
 
     if (!half_run && combination.atype == matrix_type::fp16) {
       std::cout << "half ";
-      test<half>();
+      test<half, 8, 16, 16>();
+#ifdef BIG_SHAPES
+      test<half, 16, 16, 16>();
+      test<half, 1, 64, 16>();
+      test<half, 1, 64, 32>();
+      test<half, 32, 64, 16>();
+      test<half, 32, 64, 32>();
+#endif
       half_run = true;
     }
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
@@ -10,8 +10,6 @@
 // other triples
 
 // REQUIRES: aspect-ext_intel_matrix
-// XFAIL: arch-intel_gpu_bmg_g21
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16922
 // UNSUPPORTED: gpu-intel-dg2, cpu
 // UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_big_shapes.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_big_shapes.cpp
@@ -0,0 +1,36 @@
+//==----joint_matrix_out_bounds_big_shapes.cpp - DPC++ joint_matrix---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: target-nvidia, target-amd
+// UNSUPPORTED-INTENDED: aspect-ext_intel_matrix isn't currently supported for
+// other triples
+
+// REQUIRES: aspect-ext_intel_matrix
+
+// UNSUPPORTED: gpu-intel-dg2, cpu
+// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// RUN: %{build} -o %t32.out -DSG_SZ=32
+// RUN: %{run} %t32.out
+
+// XFAIL:gpu
+// XFAIL-TRACKER: GSD-5768
+
+#include "common.hpp"
+#include "joint_matrix_out_bounds_impl.hpp"
+
+int main() { // Runs the big-shape test set once per A/B layout pairing.
+  std::cout << "A row major, B row major:\n";
+  test_all_big_shapes<layout::row_major, layout::row_major>();
+  std::cout << "A row major, B packed:\n";
+  test_all_big_shapes<layout::row_major, layout::ext_intel_packed>();
+  std::cout << "A col major, B col major:\n";
+  test_all_big_shapes<layout::col_major, layout::col_major>();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp
@@ -27,17 +27,17 @@
 #include "common.hpp"
 #include <sycl/usm.hpp>
 
-template <typename T, size_t TileRows, size_t TileCols> class MT;
+template <typename T, size_t TileRows, size_t TileCols, use Use> class MT;
 
 template <size_t TR, size_t TC, typename T, size_t NR, size_t NC, use Use>
 void matrix_transpose(T *in, T *out, queue q) {
   static_assert((NR % TR) == 0);
   static_assert((NC % TC) == 0);
-  size_t sg_size = get_sg_size<class MT<T, TR, TC>>(q);
+  size_t sg_size = get_sg_size<class MT<T, TR, TC, Use>>(q);
   std::cout << "SG size " << sg_size << " ";
 
   q.submit([&](handler &cgh) {
-     cgh.parallel_for<class MT<T, TR, TC>>(
+     cgh.parallel_for<class MT<T, TR, TC, Use>>(
          nd_range<2>({NR / TR, NC / TC * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
@@ -112,13 +112,31 @@ int main() {
       std::cout << "bf16:\n";
       test<bfloat16, 8, 16, use::a>();
       test<bfloat16, 16, 16, use::b>();
+#ifdef MORE_SHAPES
+      test<bfloat16, 1, 16, use::a>();
+      test<bfloat16, 1, 32, use::a>();
+      test<bfloat16, 16, 16, use::a>();
+      test<bfloat16, 32, 16, use::a>();
+      test<bfloat16, 32, 32, use::a>();
+      test<bfloat16, 16, 64, use::b>();
+      test<bfloat16, 32, 64, use::b>();
+#endif
       bf16_run = true;
     }
 
     if (!half_run && combination.atype == matrix_type::fp16) {
       std::cout << "half:\n";
       test<half, 8, 16, use::a>();
       test<half, 16, 16, use::b>();
+#ifdef MORE_SHAPES
+      test<half, 1, 16, use::a>();
+      test<half, 1, 32, use::a>();
+      test<half, 16, 16, use::a>();
+      test<half, 32, 16, use::a>();
+      test<half, 32, 32, use::a>();
+      test<half, 16, 64, use::b>();
+      test<half, 32, 64, use::b>();
+#endif
       half_run = true;
     }