Prefer use of 0.0 over -0.0 for fadd reductions w/nsz (in IR) #106770
Conversation
This is a follow-up to 924907b, and is mostly motivated by consistency, but it does include one additional optimization. In general, we prefer 0.0 over -0.0 as the identity value for an fadd. We use that value in several places but not in others, so let's be consistent and use the same identity (when nsz allows) everywhere. This creates a bunch of test churn, but due to 924907b, most of that churn doesn't actually indicate a change in codegen. The exception is that this change enables the use of 0.0 for fadd reductions that are nsz but *not* reassoc. Said differently, it allows the neutral value of an ordered fadd reduction to be 0.0.
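To make the constraint concrete, here is a minimal standalone C++ sketch (not part of the patch) of why -0.0 is the strict fadd identity and +0.0 only qualifies once nsz is in effect:

```cpp
// Minimal sketch, not from the patch: under IEEE-754, -0.0 + x == x for
// every x (including x == -0.0), so -0.0 is the strict fadd identity.
// +0.0 fails on exactly one input: (+0.0) + (-0.0) == +0.0, which flips
// the sign of a zero. That is harmless only when nsz says the sign of
// zero doesn't matter.
#include <cmath>
#include <cstdio>

int main() {
  double x = -0.0;
  double strict = -0.0 + x;  // stays -0.0: identity preserved
  double relaxed = 0.0 + x;  // becomes +0.0: sign of zero lost
  std::printf("signbit(-0.0 + -0.0) = %d\n", (int)std::signbit(strict)); // 1
  std::printf("signbit( 0.0 + -0.0) = %d\n", (int)std::signbit(relaxed)); // 0
  return 0;
}
```

So without nsz, seeding an fadd reduction with +0.0 could turn a -0.0 result into +0.0; with nsz that difference is declared unobservable, and the two seeds are interchangeable.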
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-powerpc
Author: Philip Reames (preames)
Changes: (same description as above.) Patch is 73.84 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/106770.diff. 36 Files Affected:
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 675d88d6d38cd9..880ee312e6c69d 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -407,7 +407,9 @@ static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
APFloat::getLargest(Semantics, Negative));
}
case Intrinsic::vp_reduce_fadd:
- return ConstantFP::getNegativeZero(EltTy);
+ return ConstantExpr::getBinOpIdentity(
+ Instruction::FAdd, EltTy, false,
+ VPI.getFastMathFlags().noSignedZeros());
case Intrinsic::vp_reduce_fmul:
return ConstantFP::get(EltTy, 1.0);
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index a49d3b0b990bc7..30ed3e70c2e8a6 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1226,9 +1226,12 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
case RecurKind::FMaximum:
return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src);
case RecurKind::FMulAdd:
- case RecurKind::FAdd:
- return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy),
- Src);
+ case RecurKind::FAdd: {
+ bool NSZ = Builder.getFastMathFlags().noSignedZeros();
+ auto *Start = ConstantExpr::getBinOpIdentity(Instruction::FAdd, SrcVecEltTy,
+ false, NSZ);
+ return Builder.CreateFAddReduce(Start, Src);
+ }
case RecurKind::FMul:
return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src);
default:
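A hedged usage sketch of the LoopUtils change follows. The API names (createSimpleTargetReduction, RecurKind, setFastMathFlags) are real LLVM APIs; the surrounding setup, and the Src and InsertPt values, are illustrative assumptions:

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

// Sketch, not from the patch: after this change, only the IRBuilder's
// fast-math flags decide which start value the FAdd reduction gets.
llvm::Value *emitSum(llvm::Instruction *InsertPt, llvm::Value *Src) {
  llvm::FastMathFlags FMF;
  FMF.setNoSignedZeros();  // without this, the start value stays -0.0
  llvm::IRBuilder<> Builder(InsertPt);
  Builder.setFastMathFlags(FMF);
  // With nsz set, this now emits (schematically):
  //   call nsz float @llvm.vector.reduce.fadd(float 0.0, <N x float> %src)
  return llvm::createSimpleTargetReduction(Builder, Src,
                                           llvm::RecurKind::FAdd);
}
```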
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
index 23160c13b821d5..35db1131df6a9d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
@@ -29,7 +29,7 @@ define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP12]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP12]])
; CHECK-NEXT: [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index fc7f81fe12807f..d707002c18c857 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -202,7 +202,7 @@ define float @fadd_fast(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
-; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[ADD]])
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> %[[ADD]])
entry:
br label %for.body
@@ -231,7 +231,7 @@ define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
-; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
+; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> %[[RDX]])
entry:
br label %for.body
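The 0xR8000 to 0xR0000 churn in the bfloat test above, and the matching 0xH8000 to 0xH0000 churn in the half-precision tests below, is the same -0.0 to +0.0 change written as raw bit patterns. A standalone sketch (not LLVM code) decoding them:

```cpp
// In IR, bfloat constants print as 0xR<bits> and half constants as
// 0xH<bits>. For both 16-bit formats, only the top bit (the sign bit)
// differs between -0.0 (0x8000) and +0.0 (0x0000).
#include <cstdint>
#include <cstdio>

int main() {
  const uint16_t NegZero = 0x8000, PosZero = 0x0000;
  std::printf("sign bit of 0x8000 (-0.0): %u\n", (NegZero >> 15) & 1u); // 1
  std::printf("sign bit of 0x0000 (+0.0): %u\n", (PosZero >> 15) & 1u); // 0
  return 0;
}
```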
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 852a967e764819..a57eb069e5ca32 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -821,7 +821,7 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP7]])
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
index cf4d65318b7e5e..7dd0f0c0ad8e0e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -93,7 +93,7 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 4 x float>
; CHECK-NOTF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-NOTF: middle.block:
-; CHECK-NOTF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+; CHECK-NOTF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %[[ADD]])
; CHECK-TF-NORED-LABEL: @fadd_red_fast
; CHECK-TF-NORED: vector.body:
@@ -101,7 +101,7 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-TF-NORED: %[[LOAD:.*]] = load <vscale x 4 x float>
; CHECK-TF-NORED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF-NORED: middle.block:
-; CHECK-TF-NORED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+; CHECK-TF-NORED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %[[ADD]])
; CHECK-TF-NOREC-LABEL: @fadd_red_fast
; CHECK-TF-NOREC: vector.body:
@@ -111,7 +111,7 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-TF-NOREC: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF-NOREC: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
; CHECK-TF-NOREC: middle.block:
-; CHECK-TF-NOREC-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+; CHECK-TF-NOREC-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %[[SEL]])
; CHECK-TF-NOREV-LABEL: @fadd_red_fast
; CHECK-TF-NOREV: vector.body:
@@ -121,7 +121,7 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-TF-NOREV: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF-NOREV: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
; CHECK-TF-NOREV: middle.block:
-; CHECK-TF-NOREV-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+; CHECK-TF-NOREV-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %[[SEL]])
; CHECK-TF-LABEL: @fadd_red_fast
; CHECK-TF: vector.body:
@@ -131,7 +131,7 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-TF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
; CHECK-TF: middle.block:
-; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %[[SEL]])
; CHECK-TF-ONLYRED-LABEL: @fadd_red_fast
; CHECK-TF-ONLYRED: vector.body:
@@ -141,7 +141,7 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-TF-ONLYRED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF-ONLYRED: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
; CHECK-TF-ONLYRED: middle.block:
-; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %[[SEL]])
; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
; CHECK-NEOVERSE-V1: vector.body:
@@ -149,7 +149,7 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-NEOVERSE-V1: %[[LOAD:.*]] = load <vscale x 4 x float>
; CHECK-NEOVERSE-V1: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-NEOVERSE-V1: middle.block:
-; CHECK-NEOVERSE-V1-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+; CHECK-NEOVERSE-V1-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %[[ADD]])
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
index 8f768d959d8b1e..17da8f314b0e74 100644
--- a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
@@ -24,7 +24,7 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) {
; GFX9-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; GFX9: middle.block:
; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]]
-; GFX9-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX]])
+; GFX9-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[BIN_RDX]])
; GFX9-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; GFX9: scalar.ph:
; GFX9-NEXT: br label [[FOR_BODY:%.*]]
@@ -54,7 +54,7 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) {
; VI-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VI: middle.block:
; VI-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]]
-; VI-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX]])
+; VI-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[BIN_RDX]])
; VI-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VI: scalar.ph:
; VI-NEXT: br label [[FOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
index 18fc7c12691a6f..1371d80b00cadd 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -336,7 +336,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
index ef31bf336088ba..b5ae9700217eae 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
@@ -531,7 +531,7 @@ define float @fadd_f32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
index d904c50f3bf9cd..fc56754166d609 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -51,7 +51,7 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 %
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
index 45e84eb409de2b..e9331be2536843 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -64,7 +64,7 @@ define i32 @test(ptr nocapture readonly %x) {
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP15:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP13]])
+; CHECK-NEXT: [[TMP15:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP13]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
index 60619f5de09372..9f13a461b2057c 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
@@ -20,7 +20,7 @@ define dso_local double @test(ptr %Arr) {
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP3]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
+; CHECK-NEXT: [[TMP5:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[DOTLCSSA]])
; CHECK-NEXT: ret double [[TMP5]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll
index e3af831f83c970..a7d2a62079d73b 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll
@@ -18,7 +18,7 @@ define dso_local double @test(ptr %Arr) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP4]])
; CHECK-NEXT: ret double [[TMP6]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
index fb4b7d11851d1a..8e7cd7f6d530dd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -206,7 +206,7 @@ define float @fadd_fast(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
-; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[ADD]])
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> %[[ADD]])
entry:
br label %for.body
@@ -235,7 +235,7 @@ define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK: %[[FADD2:.*]] = fadd fast <16 x bfloat> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = fadd fast <16 x bfloat> %[[FADD2]], %[[FADD1]]
-; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR8000, <16 x bfloat> %[[RDX]])
+; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> %[[RDX]])
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 8e36890dae16e3..8877e65a0d9d73 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -215,7 +215,7 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP120]], [[TMP119]]
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX4]]
-; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX5]])
+; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX5]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHE...
[truncated]
LGTM