Enforce dpas template argument checks, fix dpasw(), and fix the in-tree LIT test.

v-klochkov · v-klochkov · commit 25db74d16600 · 2022-09-21T15:29:33.000-07:00
Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
diff --git a/sycl/include/sycl/ext/intel/esimd/xmx/dpas.hpp b/sycl/include/sycl/ext/intel/esimd/xmx/dpas.hpp
@@ -126,23 +126,43 @@ constexpr int verify_parameters_and_deduce_exec_size() {
 
   if constexpr (APrecision == dpas_argument_type::FP16 ||
                 BPrecision == dpas_argument_type::FP16) {
-    static_assert(APrecision == BPrecision &&
-                      __ESIMD_DNS::is_type<T, float, sycl::half>() &&
-                      __ESIMD_DNS::is_type<CT, float, sycl::half>(),
-                  "Unsupported DPAS types! The supported types are:\n"
-                  " Result |   C   |   B  |  A  \n"
-                  " f, hf  | f, hf |  hf  |  hf \n");
+    if constexpr (ExecutionSize == 8) {
+      static_assert(APrecision == BPrecision &&
+                        __ESIMD_DNS::is_type<T, float>() &&
+                        __ESIMD_DNS::is_type<CT, float>(),
+                    "Unsupported DPAS types! The supported types are:\n"
+                    " Result |   C   |   B  |  A  \n"
+                    "   f    |   f   |  hf  |  hf \n");
+    } else {
+      static_assert(APrecision == BPrecision &&
+                        __ESIMD_DNS::is_type<T, float, sycl::half>() &&
+                        __ESIMD_DNS::is_type<CT, float, sycl::half>(),
+                    "Unsupported DPAS types! The supported types are:\n"
+                    " Result |   C   |   B  |  A  \n"
+                    " f, hf  | f, hf |  hf  |  hf \n");
+    }
   } else if constexpr (APrecision == dpas_argument_type::BF16 ||
                        BPrecision == dpas_argument_type::BF16) {
     using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
-    static_assert(APrecision == BPrecision &&
-                      __ESIMD_DNS::is_type<T, float, bfloat16>() &&
-                      __ESIMD_DNS::is_type<CT, float, bfloat16>(),
-                  "Unsupported DPAS types! The supported types are:\n"
-                  " Result |   C   |   B  |  A        \n"
-                  " f, bf  | f, bf |  bf  |  bf       \n");
+    if constexpr (ExecutionSize == 8) {
+      static_assert(APrecision == BPrecision &&
+                        __ESIMD_DNS::is_type<T, float, bfloat16>() &&
+                        __ESIMD_DNS::is_type<CT, float, bfloat16>(),
+                    "Unsupported DPAS types! The supported types are:\n"
+                    " Result |   C   |   B  |  A        \n"
+                    "   f    |   f   |  bf  |  bf       \n");
+    } else {
+      static_assert(APrecision == BPrecision &&
+                        __ESIMD_DNS::is_type<T, float, bfloat16>() &&
+                        __ESIMD_DNS::is_type<CT, float, bfloat16>(),
+                    "Unsupported DPAS types! The supported types are:\n"
+                    " Result |   C   |   B  |  A        \n"
+                    " f, bf  | f, bf |  bf  |  bf       \n");
+    }
   } else if constexpr (APrecision == dpas_argument_type::TF32 ||
                        BPrecision == dpas_argument_type::TF32) {
+    static_assert(ExecutionSize == 16,
+                  "tf32 type can be used only with ExecutionSize=16");
     static_assert(APrecision == BPrecision && std::is_same_v<T, float> &&
                       std::is_same_v<CT, float>,
                   "Unsupported DPAS types! The supported types are:\n"
@@ -223,7 +243,7 @@ auto dpas(__ESIMD_NS::simd<BT, BN> B, __ESIMD_NS::simd<AT, AN> A) {
       detail::verify_parameters_and_deduce_exec_size<SystolicDepth, RepeatCount,
                                                      T, T, BT, AT, BPrecision,
                                                      APrecision, BN, AN>();
-  // Result(_Mx_N) = A(_Mx_K) * B(_Kx_N) + C(_Mx_N)
+  // Result(_Mx_N) = A(_Mx_K) * B(_Kx_N)
   // where:
   //   _M = RepeatCount;
   //   _K = SystolicDepth * OpsPerChannel;
@@ -237,8 +257,10 @@ auto dpas(__ESIMD_NS::simd<BT, BN> B, __ESIMD_NS::simd<AT, AN> A) {
 
   constexpr int Info = (RepeatCount << 24) + (SystolicDepth << 16) +
                        ((int)APrecision << 8) + (int)BPrecision;
-  return __esimd_dpas_nosrc0<Info, T, int, int, ResultN, BNCasted, ANCasted>(
-      BCasted.data(), ACasted.data());
+  __ESIMD_NS::simd<T, ResultN> Result =
+      __esimd_dpas_nosrc0<Info, T, int, int, ResultN, BNCasted, ANCasted>(
+          BCasted.data(), ACasted.data());
+  return Result;
 }
 
 /// DPAS (Dot Product Accumulate Systolic)
@@ -283,24 +305,32 @@ template <
     int SystolicDepth, int RepeatCount, typename T, typename BT, typename AT,
     dpas_argument_type BPrecision = detail::dpas_precision_from_type<BT>(),
     dpas_argument_type APrecision = detail::dpas_precision_from_type<AT>(),
-    int N, int BN, int AN>
-__ESIMD_NS::simd<T, N> dpasw(__ESIMD_NS::simd<BT, BN> B,
-                             __ESIMD_NS::simd<AT, AN> A) {
+    int BN, int AN>
+auto dpasw(__ESIMD_NS::simd<BT, BN> B, __ESIMD_NS::simd<AT, AN> A) {
 
   constexpr bool IsDPASW = true;
-  (void)detail::verify_parameters_and_deduce_exec_size<
+  constexpr int ExecutionSize = detail::verify_parameters_and_deduce_exec_size<
       SystolicDepth, RepeatCount, T, T, BT, AT, BPrecision, APrecision, BN, AN,
       IsDPASW>();
 
+  // Result(_Mx_N) = A(_Mx_K) * B(_Kx_N)
+  // where:
+  //   _M = RepeatCount;
+  //   _K = SystolicDepth * OpsPerChannel;
+  //   _N = ExecutionSize (unknown, but deducible), must be 8 or 16.
+  constexpr int ResultN = RepeatCount * ExecutionSize;
+
   constexpr int ANCasted = AN / (sizeof(int) / sizeof(AT));
   constexpr int BNCasted = BN / (sizeof(int) / sizeof(BT));
   __ESIMD_NS::simd<int, ANCasted> ACasted = A.template bit_cast_view<int>();
   __ESIMD_NS::simd<int, BNCasted> BCasted = B.template bit_cast_view<int>();
 
   constexpr int Info = (RepeatCount << 24) + (SystolicDepth << 16) +
                        ((int)APrecision << 8) + (int)BPrecision;
-  return __esimd_dpasw_nosrc0<Info, T, int, int, N, BNCasted, ANCasted>(
-      BCasted.data(), ACasted.data());
+  __ESIMD_NS::simd<T, ResultN> Result =
+      __esimd_dpasw_nosrc0<Info, T, int, int, ResultN, BNCasted, ANCasted>(
+          BCasted.data(), ACasted.data());
+  return Result;
 }
 
 /// @} sycl_esimd_xmx_systolic_array_api
diff --git a/sycl/test/esimd/dpas.cpp b/sycl/test/esimd/dpas.cpp
@@ -1,5 +1,5 @@
-// RUN: %clangxx -DESIMD_XE_HPC -O0 -fsycl -c -Xclang -emit-llvm %s -o %t
-// RUN: %clangxx -DESIMD_XE_HPC -O0 -fsycl -c -fsycl-device-only -Xclang -emit-llvm %s -o %t
+// RUN: %clangxx -O0 -fsycl -c -Xclang -emit-llvm %s -o %t
+// RUN: %clangxx -O0 -fsycl -c -fsycl-device-only -Xclang -emit-llvm %s -o %t
 // RUN: sycl-post-link -split-esimd -lower-esimd -O0 -S %t -o %t.table
 // RUN: FileCheck %s -input-file=%t_esimd_0.ll
 
@@ -27,13 +27,13 @@ void bar() {
 }
 
 SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void foo() {
-  simd<short, 16> A_ACC = 7;
+  simd<sycl::ext::oneapi::experimental::bfloat16, 16> A_ACC = 7;
   simd<int, 128> A_ISRC1 = 0;
   simd<int, 8> A_ISRC2 = 0;
   simd<float, 16> A_DST =
       dpas<argument_type::BF16, argument_type::BF16, float, 8, 1>(
           A_ACC, A_ISRC1, A_ISRC2);
-  // CHECK: call <16 x float> @llvm.genx.dpas2.v16f32.v16i16.v128i32.v8i32(<16 x i16> {{[^,]+}}, <128 x i32> {{[^,]+}}, <8 x i32> {{[^,]+}}, i32 9, i32 9, i32 8, i32 1, i32 1, i32 1)
+  // CHECK: call <16 x float> @llvm.genx.dpas2.v16f32.v16i16.v128i32.v8i32(<16 x i16> {{[^,]+}}, <128 x i32> {{[^,]+}}, <8 x i32> {{[^,]+}}, i32 9, i32 9, i32 8, i32 1, i32 1, i32 0)
 
   simd<float, 16> B_ACC = 7;
   simd<int, 128> B_ISRC1 = 0;
@@ -49,16 +49,23 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void foo() {
           C_ISRC1, C_ISRC2);
   // CHECK: call <16 x float> @llvm.genx.dpas.nosrc0.v16f32.v128i32.v8i32(<128 x i32> {{[^,]+}}, <8 x i32> {{[^,]+}}, i32 {{[^,]+}})
 
-  simd<float, 16> D_ACC = 7;
-  simd<int, 128> D_ISRC1 = 0;
-  simd<int, 8> D_ISRC2 = 0;
-  simd<float, 16> D_DST = dpasw<argument_type::BF16, argument_type::BF16, 8, 1>(
+  simd<float, 8> D_ACC =
+      7; // MxN: 1x8 floats (M=RepeatCount=1, N=ExecutionSize=8)
+  simd<int, 64> D_ISRC1 =
+      0; // KxN: 16x8 bf16: (K=SysDepth*OpsPerChan=8*2, N=ExecutionSize=8)
+  simd<int, 4> D_ISRC2 =
+      0; // MxK/2: 1x8 bf16: (M=RepeatCount=1, K=SysDepth*OpsPerChan=8*2)
+  // Result is MxN: 1x8 floats
+  simd<float, 8> D_DST = dpasw<argument_type::BF16, argument_type::BF16, 8, 1>(
       D_ACC, D_ISRC1, D_ISRC2);
-  // CHECK: call <16 x float> @llvm.genx.dpasw.v16f32.v128i32.v8i32(<16 x float> {{[^,]+}}, <128 x i32> {{[^,]+}}, <8 x i32> {{[^,]+}}, i32 {{[^,]+}})
+  // CHECK: call <8 x float> @llvm.genx.dpasw.v8f32.v64i32.v4i32(<8 x float> {{[^,]+}}, <64 x i32> {{[^,]+}}, <4 x i32> {{[^,]+}}, i32 {{[^,]+}})
 
-  simd<int, 128> E_ISRC1 = 0;
-  simd<int, 8> E_ISRC2 = 0;
-  simd<float, 16> E_DST = dpasw2<argument_type::BF16, argument_type::BF16, 8, 1,
-                                 float, int, int, 16>(E_ISRC1, E_ISRC2);
-  // CHECK: call <16 x float> @llvm.genx.dpasw.nosrc0.v16f32.v128i32.v8i32(<128 x i32> {{[^,]+}}, <8 x i32> {{[^,]+}}, i32 {{[^,]+}})
+  simd<int, 64> E_ISRC1 =
+      0; // KxN: 16x8 bf16: K=SysDepth*OPC=8*2, N=ExecutionSize=8
+  simd<int, 4> E_ISRC2 =
+      0; // MxK/2: 1x16/2 bf16: M=RepeatCount, K=SysDepth*OPC=8*2
+  // Result is MxN: 1x8 floats
+  simd<float, 8> E_DST = dpasw2<argument_type::BF16, argument_type::BF16, 8, 1,
+                                float, int, int, 8>(E_ISRC1, E_ISRC2);
+  // CHECK: call <8 x float> @llvm.genx.dpasw.nosrc0.v8f32.v64i32.v4i32(<64 x i32> {{[^,]+}}, <4 x i32> {{[^,]+}}, i32 {{[^,]+}})
 }