
Commit 8e972ef

[RISCV] Add scalable vector patterns for vfwmaccbf16.v{v,f} (#106771)
We can reuse the patterns for vfwmacc.v{v,f} as long as we swap out fpext_oneuse for riscv_fpextend_bf16 in the scalar case.
1 parent 360e4ab commit 8e972ef
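
For context, a minimal sketch of the scalar-operand case the swapped-in node is aimed at (the function and value names here are illustrative, not taken from the patch): an fma whose splatted bfloat operand is widened to f32. With +zvfbfwma, IR of this shape should now select to vfwmaccbf16.vf, as the test added below checks.

; Illustrative sketch only: a splatted bfloat scalar and a bfloat vector,
; both widened to f32 and fed into llvm.fma.
define <vscale x 4 x float> @sketch_vf(<vscale x 4 x float> %acc, bfloat %s, <vscale x 4 x bfloat> %v) {
  %head = insertelement <vscale x 4 x bfloat> poison, bfloat %s, i32 0
  %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
  %s.ext = fpext <vscale x 4 x bfloat> %splat to <vscale x 4 x float>
  %v.ext = fpext <vscale x 4 x bfloat> %v to <vscale x 4 x float>
  %r = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> %s.ext, <vscale x 4 x float> %v.ext, <vscale x 4 x float> %acc)
  ret <vscale x 4 x float> %r
}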

3 files changed: +254 −5 lines changed

llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td

Lines changed: 15 additions & 5 deletions
@@ -676,13 +676,18 @@ multiclass VPatWidenBinaryFPSDNode_VV_VF_WV_WF_RM<SDNode op,
     : VPatWidenBinaryFPSDNode_VV_VF_RM<op, instruction_name>,
       VPatWidenBinaryFPSDNode_WV_WF_RM<op, instruction_name>;
 
-multiclass VPatWidenFPMulAccSDNode_VV_VF_RM<string instruction_name> {
-  foreach vtiToWti = AllWidenableFloatVectors in {
+multiclass VPatWidenFPMulAccSDNode_VV_VF_RM<string instruction_name,
+                                            list<VTypeInfoToWide> vtiToWtis,
+                                            PatFrags extop> {
+  foreach vtiToWti = vtiToWtis in {
     defvar vti = vtiToWti.Vti;
     defvar wti = vtiToWti.Wti;
     defvar suffix = vti.LMul.MX # "_E" # vti.SEW;
     let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
-                                 GetVTypePredicates<wti>.Predicates) in {
+                                 GetVTypePredicates<wti>.Predicates,
+                                 !if(!eq(vti.Scalar, bf16),
+                                     [HasStdExtZvfbfwma],
+                                     [])) in {
       def : Pat<(fma (wti.Vector (riscv_fpextend_vl_oneuse
                          (vti.Vector vti.RegClass:$rs1),
                          (vti.Mask true_mask), (XLenVT srcvalue))),
@@ -697,7 +702,7 @@ multiclass VPatWidenFPMulAccSDNode_VV_VF_RM<string instruction_name> {
                     FRM_DYN,
                     vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>;
       def : Pat<(fma (wti.Vector (SplatFPOp
-                         (fpext_oneuse (vti.Scalar vti.ScalarRegClass:$rs1)))),
+                         (extop (vti.Scalar vti.ScalarRegClass:$rs1)))),
                      (wti.Vector (riscv_fpextend_vl_oneuse
                          (vti.Vector vti.RegClass:$rs2),
                          (vti.Mask true_mask), (XLenVT srcvalue))),
@@ -1284,7 +1289,12 @@ foreach fvti = AllFloatVectors in {
 }
 
 // 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
-defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACC">;
+defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACC",
+                                        AllWidenableFloatVectors,
+                                        fpext_oneuse>;
+defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16",
+                                        AllWidenableBFloatToFloatVectors,
+                                        riscv_fpextend_bf16_oneuse>;
 defm : VPatWidenFPNegMulAccSDNode_VV_VF_RM<"PseudoVFWNMACC">;
 defm : VPatWidenFPMulSacSDNode_VV_VF_RM<"PseudoVFWMSAC">;
 defm : VPatWidenFPNegMulSacSDNode_VV_VF_RM<"PseudoVFWNMSAC">;

llvm/lib/Target/RISCV/RISCVInstrInfoZfbfmin.td

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,10 @@ def riscv_fpround_bf16
     : SDNode<"RISCVISD::FP_ROUND_BF16", SDT_RISCVFP_ROUND_BF16>;
 def riscv_fpextend_bf16
     : SDNode<"RISCVISD::FP_EXTEND_BF16", SDT_RISCVFP_EXTEND_BF16>;
+def riscv_fpextend_bf16_oneuse : PatFrag<(ops node:$A),
+                                         (riscv_fpextend_bf16 node:$A), [{
+  return N->hasOneUse();
+}]>;
 
 //===----------------------------------------------------------------------===//
 // Instructions
Lines changed: 235 additions & 0 deletions
@@ -0,0 +1,235 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN

define <vscale x 1 x float> @vfwmaccbf16_vv_nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x bfloat> %b, <vscale x 1 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vv_nxv1f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vv_nxv1f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v9
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v9, v10
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v9
; ZVFBFMIN-NEXT: ret
  %b.ext = fpext <vscale x 1 x bfloat> %b to <vscale x 1 x float>
  %c.ext = fpext <vscale x 1 x bfloat> %c to <vscale x 1 x float>
  %res = call <vscale x 1 x float> @llvm.fma.nxv1f32(<vscale x 1 x float> %b.ext, <vscale x 1 x float> %c.ext, <vscale x 1 x float> %a)
  ret <vscale x 1 x float> %res
}

define <vscale x 1 x float> @vfwmaccbf16_vf_nxv1f32(<vscale x 1 x float> %a, bfloat %b, <vscale x 1 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vf_nxv1f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vf v8, fa0, v9
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vf_nxv1f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: fmv.x.w a0, fa0
; ZVFBFMIN-NEXT: slli a0, a0, 16
; ZVFBFMIN-NEXT: fmv.w.x fa5, a0
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vf v8, fa5, v10
; ZVFBFMIN-NEXT: ret
  %b.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
  %b.splat = shufflevector <vscale x 1 x bfloat> %b.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
  %b.ext = fpext <vscale x 1 x bfloat> %b.splat to <vscale x 1 x float>
  %c.ext = fpext <vscale x 1 x bfloat> %c to <vscale x 1 x float>
  %res = call <vscale x 1 x float> @llvm.fma.nxv1f32(<vscale x 1 x float> %b.ext, <vscale x 1 x float> %c.ext, <vscale x 1 x float> %a)
  ret <vscale x 1 x float> %res
}

define <vscale x 2 x float> @vfwmaccbf16_vv_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vv_nxv2f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vv_nxv2f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v9
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v9, v10
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v9
; ZVFBFMIN-NEXT: ret
  %b.ext = fpext <vscale x 2 x bfloat> %b to <vscale x 2 x float>
  %c.ext = fpext <vscale x 2 x bfloat> %c to <vscale x 2 x float>
  %res = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> %b.ext, <vscale x 2 x float> %c.ext, <vscale x 2 x float> %a)
  ret <vscale x 2 x float> %res
}

define <vscale x 2 x float> @vfwmaccbf16_vf_nxv2f32(<vscale x 2 x float> %a, bfloat %b, <vscale x 2 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vf_nxv2f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vf v8, fa0, v9
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vf_nxv2f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: fmv.x.w a0, fa0
; ZVFBFMIN-NEXT: slli a0, a0, 16
; ZVFBFMIN-NEXT: fmv.w.x fa5, a0
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vf v8, fa5, v10
; ZVFBFMIN-NEXT: ret
  %b.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
  %b.splat = shufflevector <vscale x 2 x bfloat> %b.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
  %b.ext = fpext <vscale x 2 x bfloat> %b.splat to <vscale x 2 x float>
  %c.ext = fpext <vscale x 2 x bfloat> %c to <vscale x 2 x float>
  %res = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> %b.ext, <vscale x 2 x float> %c.ext, <vscale x 2 x float> %a)
  ret <vscale x 2 x float> %res
}

define <vscale x 4 x float> @vfwmaccbf16_vv_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vv_nxv4f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v11
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vv_nxv4f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v12, v10
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v14, v11
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vv v8, v12, v14
; ZVFBFMIN-NEXT: ret
  %b.ext = fpext <vscale x 4 x bfloat> %b to <vscale x 4 x float>
  %c.ext = fpext <vscale x 4 x bfloat> %c to <vscale x 4 x float>
  %res = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> %b.ext, <vscale x 4 x float> %c.ext, <vscale x 4 x float> %a)
  ret <vscale x 4 x float> %res
}

define <vscale x 4 x float> @vfwmaccbf16_vf_nxv4f32(<vscale x 4 x float> %a, bfloat %b, <vscale x 4 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vf_nxv4f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vf v8, fa0, v10
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vf_nxv4f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: fmv.x.w a0, fa0
; ZVFBFMIN-NEXT: slli a0, a0, 16
; ZVFBFMIN-NEXT: fmv.w.x fa5, a0
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v12, v10
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vf v8, fa5, v12
; ZVFBFMIN-NEXT: ret
  %b.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
  %b.splat = shufflevector <vscale x 4 x bfloat> %b.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
  %b.ext = fpext <vscale x 4 x bfloat> %b.splat to <vscale x 4 x float>
  %c.ext = fpext <vscale x 4 x bfloat> %c to <vscale x 4 x float>
  %res = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> %b.ext, <vscale x 4 x float> %c.ext, <vscale x 4 x float> %a)
  ret <vscale x 4 x float> %res
}

define <vscale x 8 x float> @vfwmaccbf16_vv_nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vv_nxv8f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v12, v14
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vv_nxv8f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v20, v14
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vv v8, v16, v20
; ZVFBFMIN-NEXT: ret
  %b.ext = fpext <vscale x 8 x bfloat> %b to <vscale x 8 x float>
  %c.ext = fpext <vscale x 8 x bfloat> %c to <vscale x 8 x float>
  %res = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> %b.ext, <vscale x 8 x float> %c.ext, <vscale x 8 x float> %a)
  ret <vscale x 8 x float> %res
}

define <vscale x 8 x float> @vfwmaccbf16_vf_nxv8f32(<vscale x 8 x float> %a, bfloat %b, <vscale x 8 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vf_nxv8f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vf v8, fa0, v12
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vf_nxv8f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: fmv.x.w a0, fa0
; ZVFBFMIN-NEXT: slli a0, a0, 16
; ZVFBFMIN-NEXT: fmv.w.x fa5, a0
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vf v8, fa5, v16
; ZVFBFMIN-NEXT: ret
  %b.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
  %b.splat = shufflevector <vscale x 8 x bfloat> %b.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
  %b.ext = fpext <vscale x 8 x bfloat> %b.splat to <vscale x 8 x float>
  %c.ext = fpext <vscale x 8 x bfloat> %c to <vscale x 8 x float>
  %res = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> %b.ext, <vscale x 8 x float> %c.ext, <vscale x 8 x float> %a)
  ret <vscale x 8 x float> %res
}

define <vscale x 16 x float> @vfwmaccbf16_vv_nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x bfloat> %b, <vscale x 16 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vv_nxv16f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v16, v20
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vv_nxv16f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v24, v16
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v0, v20
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vv v8, v24, v0
; ZVFBFMIN-NEXT: ret
  %b.ext = fpext <vscale x 16 x bfloat> %b to <vscale x 16 x float>
  %c.ext = fpext <vscale x 16 x bfloat> %c to <vscale x 16 x float>
  %res = call <vscale x 16 x float> @llvm.fma.nxv16f32(<vscale x 16 x float> %b.ext, <vscale x 16 x float> %c.ext, <vscale x 16 x float> %a)
  ret <vscale x 16 x float> %res
}

define <vscale x 16 x float> @vfwmaccbf16_vf_nxv16f32(<vscale x 16 x float> %a, bfloat %b, <vscale x 16 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vf_nxv16f32:
; ZVFBFWMA: # %bb.0:
; ZVFBFWMA-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFBFWMA-NEXT: vfwmaccbf16.vf v8, fa0, v16
; ZVFBFWMA-NEXT: ret
;
; ZVFBFMIN-LABEL: vfwmaccbf16_vf_nxv16f32:
; ZVFBFMIN: # %bb.0:
; ZVFBFMIN-NEXT: fmv.x.w a0, fa0
; ZVFBFMIN-NEXT: slli a0, a0, 16
; ZVFBFMIN-NEXT: fmv.w.x fa5, a0
; ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v24, v16
; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFBFMIN-NEXT: vfmacc.vf v8, fa5, v24
; ZVFBFMIN-NEXT: ret
  %b.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
  %b.splat = shufflevector <vscale x 16 x bfloat> %b.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
  %b.ext = fpext <vscale x 16 x bfloat> %b.splat to <vscale x 16 x float>
  %c.ext = fpext <vscale x 16 x bfloat> %c to <vscale x 16 x float>
  %res = call <vscale x 16 x float> @llvm.fma.nxv16f32(<vscale x 16 x float> %b.ext, <vscale x 16 x float> %c.ext, <vscale x 16 x float> %a)
  ret <vscale x 16 x float> %res
}
