@@ -172,7 +172,7 @@ SPDX-License-Identifier: MIT
     /* not supported, fallthrough */
 #define IMPLEMENT_BLOCK2D_LOAD_VNNI_TX(element_type, elem_bitwidth, contrib_type, contrib_bitwidth, M, K, stride_opt) \
     /* not supported, fallthrough */
-#define IMPLEMENT_BLOCK2D_STORE(element_type, contrib_type, contrib_bitwidth, M, K, vec) \
+#define IMPLEMENT_BLOCK2D_STORE(element_type, contrib_type, contrib_bitwidth, M, K) \
     /* not supported, fallthrough */
 
 #define IMPLEMENT_BLOCK2D_LOAD_SG16_ROW_MAJOR(element_type, elem_bitwidth, contrib_type, contrib_bitwidth, M, K, stride_opt) \
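Note: this variant of IMPLEMENT_BLOCK2D_STORE expands to a comment only, so its trailing vec parameter carried no value; dropping it keeps the signature in sync with the reworked SG16 variant and the call sites below. A minimal sketch of the fallthrough-macro pattern, with illustrative names (STORE_OPTIMIZED and store_row_major are not from the IGC source):

/* Illustrative sketch: an optimization macro either expands to a full
 * implementation that ends in `return;`, or to a bare comment, in which
 * case control falls through to the generic path after it. */
#define STORE_OPTIMIZED(element_type, M) /* not supported, fallthrough */

void store_row_major(int *mem, const int *src, long stride) {
    STORE_OPTIMIZED(int, 8) /* expands to nothing executable */
    /* generic scalar path runs when the macro falls through */
    for (int i = 0; i < 8; ++i)
        mem[i * stride] = src[i];
}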
@@ -216,7 +216,7 @@ SPDX-License-Identifier: MIT
         *(__private OUT_VEC##M(u##contrib_type) *)dst = res; \
         return;
 
-#define IMPLEMENT_BLOCK2D_STORE_SG16(element_type, contrib_type, contrib_bitwidth, M, K, vec) \
+#define IMPLEMENT_BLOCK2D_STORE_SG16(element_type, contrib_type, contrib_bitwidth, M, K) \
     long offset = as_long(mem); \
     long baseoffset = offset & (~0x3f); /* align to 64-byte */ \
     int width = (sizeof (element_type)) * stride - 1; /* in bytes */ \
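Note on the address setup above: 2D block messages want a 64-byte-aligned base, so the macro masks off the low six offset bits and encodes the row pitch as bytes minus one. A minimal sketch of that computation under assumed names (block2d_desc_t and make_desc are illustrative, not the IGC intrinsic interface):

#include <stdint.h>

typedef struct {
    long base;  /* 64-byte-aligned base address */
    int  width; /* row width in bytes, minus one */
    int  x_off; /* x offset in elements, re-adding the masked-off bytes */
} block2d_desc_t;

static block2d_desc_t make_desc(const void *mem, long stride, int elem_size) {
    long offset = (long)(uintptr_t)mem;
    block2d_desc_t d;
    d.base  = offset & ~0x3fL;                  /* align base down to 64 B */
    d.width = elem_size * (int)stride - 1;      /* hardware wants size-1   */
    d.x_off = (int)(offset & 0x3f) / elem_size; /* recover dropped bytes   */
    return d;
}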
@@ -470,7 +470,7 @@ DEFINE_LOAD(Accumulator_ColumnMajor, _SG16, int, 32, int, 32, 2, 16, 2x16, COL_M
     if (WI_rows == M && BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= BLOCK2D_IMPL && (M == 2 || M == 4 || M == 8) \
         && order == _ROW_MAJOR && address_space == AS_GLOBAL && elem_bitwidth > 8 \
         ) { \
-        IMPLEMENT_BLOCK2D_STORE##sg(element_type, contrib_type, contrib_bitwidth, M, K, src) \
+        IMPLEMENT_BLOCK2D_STORE##sg(element_type, contrib_type, contrib_bitwidth, M, K) \
     } \
     if (WI_rows == M && BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= VECTOR_CONT_IMPL && stride == stride_opt \
         && (M == 2 || M == 4 || M == 8) && order == _ROW_MAJOR \
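Here the call site drops the trailing src argument to match the new macro signatures; ##sg token pasting picks the subgroup variant. A small sketch of that dispatch with illustrative macro names:

/* Pasting the sg suffix onto the macro name selects the plain or
 * the _SG16 implementation at preprocessing time. */
#define IMPL_STORE(T)      /* generic variant */
#define IMPL_STORE_SG16(T) /* SG16 variant */
#define DISPATCH_STORE(sg, T) IMPL_STORE##sg(T)

/* DISPATCH_STORE(, int)      -> IMPL_STORE(int)      */
/* DISPATCH_STORE(_SG16, int) -> IMPL_STORE_SG16(int) */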
@@ -955,10 +955,17 @@ DEFINE_B_B_16x64(local)
 #define DEFINE_STORE_IMPL_LARGE(layout, sg, element_type, elem_bitwidth, contrib_type, contrib_bitwidth, M, K, shape, order, us, stride_opt, address_space) \
   INLINE void MANGLE_STORE_NAME_##address_space(layout, sg, elem_bitwidth, shape, M) (char *mem, __private char *src, long stride) { \
       int sg_size = get_sub_group_size(); \
-      if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= BLOCK2D_IMPL && (M == 2 || M == 4 || M == 8) \
-          && order == _ROW_MAJOR && address_space == AS_GLOBAL && elem_bitwidth > 8 \
-          ) { \
-          IMPLEMENT_BLOCK2D_STORE##sg(element_type, contrib_type, contrib_bitwidth, M, K, src) \
+      if (BIF_FLAG_CTRL_GET(JointMatrixLoadStoreOpt) >= BLOCK2D_IMPL && M == 16 \
+          && order == _ROW_MAJOR && address_space == AS_GLOBAL && elem_bitwidth > 8) { \
+          __private char *c0 = src + 0 * 8 * (sizeof (int)); \
+          __private char *c1 = src + 1 * 8 * (sizeof (int)); \
+\
+          char *mem0 = mem; \
+          char *mem1 = mem + 8 * (sizeof (int)) * stride; \
+\
+          __builtin_spriv_OpJointMatrixStoreINTEL_Accumulator_RowMajor_SG16_8x16_i32_8_global_pi64_v8i8(mem0, c0, stride); \
+          __builtin_spriv_OpJointMatrixStoreINTEL_Accumulator_RowMajor_SG16_8x16_i32_8_global_pi64_v8i8(mem1, c1, stride); \
+          return; \
       } \
       contrib_type *ptr = (contrib_type *)mem; \
       int slid = get_sub_group_local_id(); \
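This is the substantive change: DEFINE_STORE_IMPL_LARGE no longer routes large shapes through the generic block-2D store macro; instead, a 16x16 i32 row-major accumulator store is issued as two 8x16 block stores, reusing the existing 8x16 builtin. A minimal sketch of the decomposition, assuming stride counts elements per row (store_acc_16x16 and store_8x16_fn are illustrative names; the real callee is the __builtin_spriv_...8x16... builtin in the hunk above):

/* Each work item holds 16 int elements in src (8 per half); the second
 * half of the matrix starts 8 rows further down in memory, i.e.
 * 8 * stride elements = 8 * sizeof(int) * stride bytes past the base. */
typedef void (*store_8x16_fn)(char *mem, char *src, long stride);

static void store_acc_16x16(char *mem, char *src, long stride,
                            store_8x16_fn store_8x16) {
    char *c0 = src + 0 * 8 * sizeof(int); /* WI slice, rows 0..7  */
    char *c1 = src + 1 * 8 * sizeof(int); /* WI slice, rows 8..15 */

    char *mem0 = mem;                            /* destination rows 0..7  */
    char *mem1 = mem + 8 * sizeof(int) * stride; /* destination rows 8..15 */

    store_8x16(mem0, c0, stride);
    store_8x16(mem1, c1, stride);
}

Reusing the existing 8x16 builtin avoids introducing a dedicated 16x16 block-store path while leaving the scalar fallback below the if block unchanged.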