[SDAG][RISCV] Don't promote VP_REDUCE_{FADD,FMUL} (#111000)

lukel97 · web-flow · commit 487686b82e9a · 2024-10-04T00:17:45.000+08:00
In https://reviews.llvm.org/D153848, promotion was added for a variety of f16 ops with zvfhmin, including VP reductions.

However, I don't believe it's correct to promote f16 fadd or fmul reductions to f32 since we need to round the intermediate results.

Today if we lower @llvm.vp.reduce.fadd.nxv1f16 on RISC-V, we'll get two different results depending on whether we compiled with +zvfh or +zvfhmin, for example with a 3-element reduction:

    ; v9 = [0.1563, 5.97e-8, 0.00006104]

    ; zvfh
    vsetivli x0, 3, e16, m1, ta, ma
    vmv.v.i v8, 0
    vfredosum.vs v8, v9, v8
    vfmv.f.s fa0, v8
    ; fa0 = 0.1563

    ; zvfhmin
    vsetivli x0, 3, e16, m1, ta, ma
    vfwcvt.f.f.v v10, v9
    vsetivli x0, 3, e32, m1, ta, ma
    vmv.v.i v8, 0
    vfredosum.vs v8, v10, v8
    vfmv.f.s fa0, v8
    fcvt.h.s fa0, fa0
    ; fa0 = 0.1564

This same thing happens with reassociative reductions e.g. vfredusum.vs, and this also applies for bf16.

I couldn't find anything in the LangRef for reductions that suggests the excess precision is allowed. There may be something we can do in Clang with -fexcess-precision=fast, but I haven't looked into this yet.

I presume the same precision issue occurs with fmul, but not with fmin/fmax/fminimum/fmaximum.

I can't think of another way of lowering these other than scalarizing, and we can't scalarize scalable vectors, so this just removes the promotion and adjusts the cost model to return an invalid cost. (It looks like we also don't currently cost fmul reductions, so presumably they also have an invalid cost?)

I think this should be enough to stop the loop vectorizer or SLP from emitting these intrinsics.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -5851,13 +5851,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
                     DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));
     break;
   }
-  case ISD::VP_REDUCE_FADD:
-  case ISD::VP_REDUCE_FMUL:
   case ISD::VP_REDUCE_FMAX:
   case ISD::VP_REDUCE_FMIN:
   case ISD::VP_REDUCE_FMAXIMUM:
   case ISD::VP_REDUCE_FMINIMUM:
-  case ISD::VP_REDUCE_SEQ_FADD:
     Results.push_back(PromoteReduction(Node));
     break;
   }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -957,8 +957,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         ISD::VP_FMUL,
         ISD::VP_FDIV,
         ISD::VP_FMA,
-        ISD::VP_REDUCE_FADD,
-        ISD::VP_REDUCE_SEQ_FADD,
         ISD::VP_REDUCE_FMIN,
         ISD::VP_REDUCE_FMAX,
         ISD::VP_SQRT,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1531,6 +1531,11 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
     Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
     break;
   case ISD::FADD:
+    // We can't promote f16/bf16 fadd reductions.
+    if ((LT.second.getVectorElementType() == MVT::f16 &&
+         !ST->hasVInstructionsF16()) ||
+        LT.second.getVectorElementType() == MVT::bf16)
+      return InstructionCost::getInvalid();
     SplitOp = RISCV::VFADD_VV;
     Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
     break;
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
@@ -1,18 +1,30 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefix=FP-REDUCE
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
 ; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s  --check-prefix=SIZE
 
 define void @reduce_fadd_half() {
-; FP-REDUCE-LABEL: 'reduce_fadd_half'
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; FP-REDUCE-ZVFH-LABEL: 'reduce_fadd_half'
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; FP-REDUCE-ZVFHMIN-LABEL: 'reduce_fadd_half'
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V4 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V8 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %v32 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V64 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V128 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_fadd_half'
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -1,117 +1,61 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
 
 declare half @llvm.vp.reduce.fadd.v2f16(half, <2 x half>, <2 x i1>, i32)
 
 define half @vpreduce_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vpreduce_fadd_v2f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFH-NEXT:    vfmv.s.f v9, fa0
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFH-NEXT:    vfredusum.vs v9, v8, v9, v0.t
-; ZVFH-NEXT:    vfmv.f.s fa0, v9
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vpreduce_fadd_v2f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmv.s.f v8, fa5
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfredusum.vs v8, v9, v8, v0.t
-; ZVFHMIN-NEXT:    vfmv.f.s fa5, v8
-; ZVFHMIN-NEXT:    fcvt.h.s fa0, fa5
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vpreduce_fadd_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v9, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    ret
   %r = call reassoc half @llvm.vp.reduce.fadd.v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 %evl)
   ret half %r
 }
 
 define half @vpreduce_ord_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vpreduce_ord_fadd_v2f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFH-NEXT:    vfmv.s.f v9, fa0
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFH-NEXT:    vfredosum.vs v9, v8, v9, v0.t
-; ZVFH-NEXT:    vfmv.f.s fa0, v9
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vpreduce_ord_fadd_v2f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmv.s.f v8, fa5
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfredosum.vs v8, v9, v8, v0.t
-; ZVFHMIN-NEXT:    vfmv.f.s fa5, v8
-; ZVFHMIN-NEXT:    fcvt.h.s fa0, fa5
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vpreduce_ord_fadd_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v9, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vfredosum.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    ret
   %r = call half @llvm.vp.reduce.fadd.v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 %evl)
   ret half %r
 }
 
 declare half @llvm.vp.reduce.fadd.v4f16(half, <4 x half>, <4 x i1>, i32)
 
 define half @vpreduce_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vpreduce_fadd_v4f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFH-NEXT:    vfmv.s.f v9, fa0
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfredusum.vs v9, v8, v9, v0.t
-; ZVFH-NEXT:    vfmv.f.s fa0, v9
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vpreduce_fadd_v4f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmv.s.f v8, fa5
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfredusum.vs v8, v9, v8, v0.t
-; ZVFHMIN-NEXT:    vfmv.f.s fa5, v8
-; ZVFHMIN-NEXT:    fcvt.h.s fa0, fa5
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vpreduce_fadd_v4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v9, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    ret
   %r = call reassoc half @llvm.vp.reduce.fadd.v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 %evl)
   ret half %r
 }
 
 define half @vpreduce_ord_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vpreduce_ord_fadd_v4f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFH-NEXT:    vfmv.s.f v9, fa0
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfredosum.vs v9, v8, v9, v0.t
-; ZVFH-NEXT:    vfmv.f.s fa0, v9
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vpreduce_ord_fadd_v4f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmv.s.f v8, fa5
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfredosum.vs v8, v9, v8, v0.t
-; ZVFHMIN-NEXT:    vfmv.f.s fa5, v8
-; ZVFHMIN-NEXT:    fcvt.h.s fa0, fa5
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vpreduce_ord_fadd_v4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v9, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfredosum.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    ret
   %r = call half @llvm.vp.reduce.fadd.v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 %evl)
   ret half %r
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll

Original file line number	Diff line number	Diff line change
`@@ -5851,13 +5851,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {`
`5851`	`5851`	`DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));`
`5852`	`5852`	`break;`
`5853`	`5853`	`}`
`5854`		`- case ISD::VP_REDUCE_FADD:`
`5855`		`- case ISD::VP_REDUCE_FMUL:`
`5856`	`5854`	`case ISD::VP_REDUCE_FMAX:`
`5857`	`5855`	`case ISD::VP_REDUCE_FMIN:`
`5858`	`5856`	`case ISD::VP_REDUCE_FMAXIMUM:`
`5859`	`5857`	`case ISD::VP_REDUCE_FMINIMUM:`
`5860`		`- case ISD::VP_REDUCE_SEQ_FADD:`
`5861`	`5858`	`Results.push_back(PromoteReduction(Node));`
`5862`	`5859`	`break;`
`5863`	`5860`	`}`