[SYCL][JointMatrix] Added missing required subgroup to VNNI tests (#10565)

YuriPlyakhin · web-flow · commit 2cfdfa4ef69c · 2023-07-26T15:08:05.000-03:00
The tests were failing because the kernels did not set a required sub-group
size. The only functional change is adding
"[[intel::reqd_sub_group_size(SG_SZ)]]" to the kernel lambdas. The rest is
clang-format reformatting.
diff --git a/sycl/test-e2e/Matrix/Legacy/joint_matrix_int8_vnni_impl.hpp b/sycl/test-e2e/Matrix/Legacy/joint_matrix_int8_vnni_impl.hpp
@@ -37,42 +37,41 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
          [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
-
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup no
-           // code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<int8_t, TM, TK> sub_a(sg);
-           joint_matrix<int8_t, TK, TN, matrix_layout::packed_b> sub_b(sg);
-           joint_matrix<int32_t, TM, TN> sub_c(sg);
-
-           joint_matrix_fill(sg, sub_c, 0);
-           for (int k = 0; k < K / TK; k += 1) {
-             joint_matrix_load(
-                 sg, sub_a,
-                 accA.template get_multi_ptr<access::decorated::no>() +
-                     (sg_startx * TM) * K + k * TK,
-                 K, matrix_layout::row_major);
-             // VNNI transform is done automatically at this level
-             joint_matrix_load(
-                 sg, sub_b,
-                 accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK) * N + sg_starty / SG_SZ * TN,
-                 N, matrix_layout::row_major);
-             sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
-           }
-           joint_matrix_store(
-               sg, sub_c,
-               accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
-               N, matrix_layout::row_major);
-         }); // parallel for
+             [[intel::reqd_sub_group_size(SG_SZ)]] {
+               // The submatrix API has to be accessed by all the workitems in a
+               // subgroup these functions will be called once by the subgroup
+               // no code divergence between the workitems
+               const auto global_idx = spmd_item.get_global_id(0);
+               const auto global_idy = spmd_item.get_global_id(1);
+               const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+               const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+               sub_group sg = spmd_item.get_sub_group();
+               joint_matrix<int8_t, TM, TK> sub_a(sg);
+               joint_matrix<int8_t, TK, TN, matrix_layout::packed_b> sub_b(sg);
+               joint_matrix<int32_t, TM, TN> sub_c(sg);
+
+               joint_matrix_fill(sg, sub_c, 0);
+               for (int k = 0; k < K / TK; k += 1) {
+                 joint_matrix_load(
+                     sg, sub_a,
+                     accA.template get_multi_ptr<access::decorated::no>() +
+                         (sg_startx * TM) * K + k * TK,
+                     K, matrix_layout::row_major);
+                 // VNNI transform is done automatically at this level
+                 joint_matrix_load(
+                     sg, sub_b,
+                     accB.template get_multi_ptr<access::decorated::no>() +
+                         (k * TK) * N + sg_starty / SG_SZ * TN,
+                     N, matrix_layout::row_major);
+                 sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+               }
+               joint_matrix_store(
+                   sg, sub_c,
+                   accC.template get_multi_ptr<access::decorated::no>() +
+                       (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   N, matrix_layout::row_major);
+             }); // parallel for
    }).wait();
 }
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp
@@ -36,9 +36,8 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
 
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
-
-         {
+         [accA, accB, accC, M, N,
+          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems