rename intel_sub_group_matrix_mad (DPAS) for tf32 type

pkwasnie-intel · fda0 · commit 493e17c043e8 · 2024-06-18T09:04:50.000+02:00
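Renames the tf32 DPAS built-in from intel_sub_group_tf32_tf32_matrix_mad_k8_f32 to intel_sub_group_tf32_tf32_matrix_mad_k8, dropping the return-type suffix from the name. Also gives the TF32 precision enumerator the explicit value 10 and adds an ocloc test covering the four overloads. (cherry picked from commit 1b55151)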
diff --git a/IGC/BiFModule/Languages/OpenCL/IBiF_dpas.cl b/IGC/BiFModule/Languages/OpenCL/IBiF_dpas.cl
@@ -489,10 +489,10 @@ DEFN_INTEL_CVT2( f32_to_bf16_packed,  int16, float16, float16, 2fto2bf_16 )
 #ifdef cl_intel_subgroup_matrix_multiply_accumulate_tf32
 // PVC_B
 
-DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float,   float,   float,   float8,  fdpas_f_f_tf32_tf32_8_1 )
-DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float2,  float2,  float,   float8,  fdpas_f_f_tf32_tf32_8_2 )
-DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float4,  float4,  float2,  float8,  fdpas_f_f_tf32_tf32_8_4 )
-DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8_f32, float8,  float8,  float4,  float8,  fdpas_f_f_tf32_tf32_8_8 )
+DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8, float,   float,   float,   float8,  fdpas_f_f_tf32_tf32_8_1 )
+DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8, float2,  float2,  float,   float8,  fdpas_f_f_tf32_tf32_8_2 )
+DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8, float4,  float4,  float2,  float8,  fdpas_f_f_tf32_tf32_8_4 )
+DEFN_INTEL_SG16_FDPAS( tf32_tf32_matrix_mad_k8, float8,  float8,  float4,  float8,  fdpas_f_f_tf32_tf32_8_8 )
 
 DEFN_INTEL_CVT_NO_OVERLOAD( tfloat32_as_float,     float,   float,   ftotf32_1  )
 DEFN_INTEL_CVT_NO_OVERLOAD( tfloat322_as_float2,   float2,  float2,  ftotf32_2  )
diff --git a/IGC/BiFModule/Languages/OpenCL/PreRelease/opencl_cth_pre_release.h b/IGC/BiFModule/Languages/OpenCL/PreRelease/opencl_cth_pre_release.h
@@ -2386,16 +2386,16 @@ int16 __attribute__((overloadable)) intel_convert_f32_to_bf16_packed(float16 a,
 // DST: float
 
 // M = 1, K = 8, N = 16, upper 8 channels of a ignored
-float  __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(float  a, float8 b, float  acc);
+float  __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8(float  a, float8 b, float  acc);
 
 // M = 2, K = 8, N = 16, all channels of a are used
-float2 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(float  a, float8 b, float2 acc);
+float2 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8(float  a, float8 b, float2 acc);
 
 // M = 4, K = 8, N = 16
-float4 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(float2 a, float8 b, float4 acc);
+float4 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8(float2 a, float8 b, float4 acc);
 
 // M = 8, K = 8, N = 16
-float8 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8_f32(float4 a, float8 b, float8 acc);
+float8 __attribute__((overloadable)) intel_sub_group_tf32_tf32_matrix_mad_k8(float4 a, float8 b, float8 acc);
 
 // Conversions
 float   intel_convert_tfloat32_as_float(    float   source);
diff --git a/IGC/common/Types.hpp b/IGC/common/Types.hpp
@@ -36,7 +36,7 @@ namespace IGC
     {
         PRECISION_UNUSED, U8, U4, U2, S8, S4, S2,
         BF8,
-        TF32,
+        TF32 = 10,
         BF16, FP16
     };
 
diff --git a/IGC/ocloc_tests/Builtins/cl_intel_subgroup_matrix_multiply_accumulate_tf32/dpas.ll b/IGC/ocloc_tests/Builtins/cl_intel_subgroup_matrix_multiply_accumulate_tf32/dpas.ll
@@ -0,0 +1,62 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2024 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; REQUIRES: llvm-spirv, regkeys, pvc-supported
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_cache_controls -o %t.spv
+; RUN: ocloc compile -spirv_input -file %t.spv -device pvc -options " -igc_opts 'PrintToConsole=1 PrintAfter=Layout'" 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+
+declare spir_func float @_Z39intel_sub_group_tf32_tf32_matrix_mad_k8fDv8_ff(float, <8 x float>, float) ; (float a, float8 b, float acc) -> float
+declare spir_func <2 x float> @_Z39intel_sub_group_tf32_tf32_matrix_mad_k8fDv8_fDv2_f(float, <8 x float>, <2 x float>) ; (float a, float8 b, float2 acc) -> float2
+declare spir_func <4 x float> @_Z39intel_sub_group_tf32_tf32_matrix_mad_k8Dv2_fDv8_fDv4_f(<2 x float>, <8 x float>, <4 x float>) ; (float2 a, float8 b, float4 acc) -> float4
+declare spir_func <8 x float> @_Z39intel_sub_group_tf32_tf32_matrix_mad_k8Dv4_fDv8_fS0_(<4 x float>, <8 x float>, <8 x float>) ; (float4 a, float8 b, float8 acc) -> float8
+
+define spir_kernel void @test_v1(float %a, <8 x float> %b, float %acc, float addrspace(1)* %c) !intel_reqd_sub_group_size !100 { ; exercises the (float a, float8 b, float acc) overload; !100 requests sub-group size 16
+entry: ; the expected dpas call below takes %acc first, then %a, and ends with i32 1 for this overload
+; CHECK-LABEL: @test_v1(
+; CHECK:         call float @llvm.genx.GenISA.sub.group.dpas.f32.f32.f32.v8i32(float %acc, float %a, <8 x i32> %{{.+}}, i32 10, i32 10, i32 8, i32 1, i1 false)
+  %call = call spir_func float @_Z39intel_sub_group_tf32_tf32_matrix_mad_k8fDv8_ff(float %a, <8 x float> %b, float %acc) ; built-in under test, called as (a, b, acc)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 0 ; index 0: addresses the first float behind %c
+  store float %call, float addrspace(1)* %arrayidx, align 4 ; result is written to memory (presumably so the call stays live - confirm)
+  ret void
+}
+
+define spir_kernel void @test_v2(float %a, <8 x float> %b, <2 x float> %acc, <2 x float> addrspace(1)* %c) !intel_reqd_sub_group_size !100 { ; exercises the (float a, float8 b, float2 acc) overload; !100 requests sub-group size 16
+entry: ; the expected dpas call below takes %acc first, then %a, and ends with i32 2 for this overload
+; CHECK-LABEL: @test_v2(
+; CHECK:         call <2 x float> @llvm.genx.GenISA.sub.group.dpas.v2f32.v2f32.f32.v8i32(<2 x float> %acc, float %a, <8 x i32> %{{.+}}, i32 10, i32 10, i32 8, i32 2, i1 false)
+  %call = call spir_func <2 x float> @_Z39intel_sub_group_tf32_tf32_matrix_mad_k8fDv8_fDv2_f(float %a, <8 x float> %b, <2 x float> %acc) ; built-in under test, called as (a, b, acc)
+  %arrayidx = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %c, i64 0 ; index 0: addresses the first <2 x float> behind %c
+  store <2 x float> %call, <2 x float> addrspace(1)* %arrayidx, align 4 ; result is written to memory (presumably so the call stays live - confirm)
+  ret void
+}
+
+define spir_kernel void @test_v4(<2 x float> %a, <8 x float> %b, <4 x float> %acc, <4 x float> addrspace(1)* %c) !intel_reqd_sub_group_size !100 { ; exercises the (float2 a, float8 b, float4 acc) overload; !100 requests sub-group size 16
+entry: ; the expected dpas call below takes %acc first, then %a, and ends with i32 4 for this overload
+; CHECK-LABEL: @test_v4(
+; CHECK:         call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v2f32.v8i32(<4 x float> %acc, <2 x float> %a, <8 x i32> %{{.+}}, i32 10, i32 10, i32 8, i32 4, i1 false)
+  %call = call spir_func <4 x float> @_Z39intel_sub_group_tf32_tf32_matrix_mad_k8Dv2_fDv8_fDv4_f(<2 x float> %a, <8 x float> %b, <4 x float> %acc) ; built-in under test, called as (a, b, acc)
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c, i64 0 ; index 0: addresses the first <4 x float> behind %c
+  store <4 x float> %call, <4 x float> addrspace(1)* %arrayidx, align 4 ; result is written to memory (presumably so the call stays live - confirm)
+  ret void
+}
+
+define spir_kernel void @test_v8(<4 x float> %a, <8 x float> %b, <8 x float> %acc, <8 x float> addrspace(1)* %c) !intel_reqd_sub_group_size !100 { ; exercises the (float4 a, float8 b, float8 acc) overload; !100 requests sub-group size 16
+entry: ; the expected dpas call below takes %acc first, then %a, and ends with i32 8 for this overload
+; CHECK-LABEL: @test_v8(
+; CHECK:         call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v4f32.v8i32(<8 x float> %acc, <4 x float> %a, <8 x i32> %{{.+}}, i32 10, i32 10, i32 8, i32 8, i1 false)
+  %call = call spir_func <8 x float> @_Z39intel_sub_group_tf32_tf32_matrix_mad_k8Dv4_fDv8_fS0_(<4 x float> %a, <8 x float> %b, <8 x float> %acc) ; built-in under test, called as (a, b, acc)
+  %arrayidx = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %c, i64 0 ; index 0: addresses the first <8 x float> behind %c
+  store <8 x float> %call, <8 x float> addrspace(1)* %arrayidx, align 4 ; result is written to memory (presumably so the call stays live - confirm)
+  ret void
+}
+
+!100 = !{i32 16}

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ namespace IGC`
`36`	`36`	`{`
`37`	`37`	`PRECISION_UNUSED, U8, U4, U2, S8, S4, S2,`
`38`	`38`	`BF8,`
`39`		`- TF32,`
	`39`	`+ TF32 = 10,`
`40`	`40`	`BF16, FP16`
`41`	`41`	`};`
`42`	`42`