Skip to content

Commit cb3ce30

Browse files
authored
[VPlan] Fold NOT into predicate of wide compares. (#129430)
Add a simplification that folds a negation into a compare when the negation is the only user of the compare. This removes a number of redundant negations. Alive2 proofs for the FP compare test changes: https://alive2.llvm.org/ce/z/WGDz9U (PR: #129430)
1 parent 1b01f05 commit cb3ce30

28 files changed

+241
-381
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,9 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
459459
/// Returns the debug location of the recipe.
460460
DebugLoc getDebugLoc() const { return DL; }
461461

462+
/// Set the recipe's debug location to \p NewDL.
463+
void setDebugLoc(DebugLoc NewDL) { DL = NewDL; }
464+
462465
protected:
463466
/// Compute the cost of this recipe either using a recipe's specialized
464467
/// implementation or using the legacy cost model and the underlying
@@ -794,6 +797,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
794797
return CmpPredicate;
795798
}
796799

800+
void setPredicate(CmpInst::Predicate Pred) {
801+
assert(OpType == OperationType::Cmp &&
802+
"recipe doesn't have a compare predicate");
803+
CmpPredicate = Pred;
804+
}
805+
797806
GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }
798807

799808
/// Returns true if the recipe has fast-math flags.

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -972,8 +972,25 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
972972
if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
973973
return R.getVPSingleValue()->replaceAllUsesWith(A);
974974

975-
if (match(&R, m_Not(m_Not(m_VPValue(A)))))
976-
return R.getVPSingleValue()->replaceAllUsesWith(A);
975+
if (match(&R, m_Not(m_VPValue(A)))) {
976+
if (match(A, m_Not(m_VPValue(A))))
977+
return R.getVPSingleValue()->replaceAllUsesWith(A);
978+
979+
// Try to fold Not into compares by adjusting the predicate in-place.
980+
if (isa<VPWidenRecipe>(A) && A->getNumUsers() == 1) {
981+
auto *WideCmp = cast<VPWidenRecipe>(A);
982+
if (WideCmp->getOpcode() == Instruction::ICmp ||
983+
WideCmp->getOpcode() == Instruction::FCmp) {
984+
WideCmp->setPredicate(
985+
CmpInst::getInversePredicate(WideCmp->getPredicate()));
986+
R.getVPSingleValue()->replaceAllUsesWith(WideCmp);
987+
// If WideCmp doesn't have a debug location, use the one from the
988+
// negation, to preserve the location.
989+
if (!WideCmp->getDebugLoc() && R.getDebugLoc())
990+
WideCmp->setDebugLoc(R.getDebugLoc());
991+
}
992+
}
993+
}
977994

978995
// Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
979996
if ((match(&R,

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,8 +1037,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
10371037
; TFALWAYS-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
10381038
; TFALWAYS-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
10391039
; TFALWAYS-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
1040-
; TFALWAYS-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
1041-
; TFALWAYS-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
1040+
; TFALWAYS-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
10421041
; TFALWAYS-NEXT: [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
10431042
; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
10441043
; TFALWAYS-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
@@ -1082,8 +1081,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
10821081
; TFFALLBACK-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
10831082
; TFFALLBACK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
10841083
; TFFALLBACK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
1085-
; TFFALLBACK-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
1086-
; TFFALLBACK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
1084+
; TFFALLBACK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
10871085
; TFFALLBACK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
10881086
; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
10891087
; TFFALLBACK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
@@ -1133,10 +1131,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
11331131
; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
11341132
; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0
11351133
; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP10]], i32 1
1136-
; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
1137-
; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer
1138-
; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP13]], splat (i1 true)
1139-
; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP14]], splat (i1 true)
1134+
; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
1135+
; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = fcmp ule <2 x double> [[TMP12]], zeroinitializer
11401136
; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
11411137
; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK2]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
11421138
; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer

llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
target triple = "aarch64-unknown-linux-gnu"
66

77
; CHECK-COST: Checking a loop in 'fixed_width'
8-
; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
9-
; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
8+
; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<{{.+}}>, ir<2>, ir<{{.+}}>
9+
; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<{{.+}}>, ir<2>, ir<{{.+}}>
1010
; CHECK-COST: Selecting VF: 1.
1111

1212
; We should decide this loop is not worth vectorising using fixed width vectors

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,8 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
4343
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP10]]
4444
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
4545
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
46-
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
46+
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
4747
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
48-
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 16 x i1> [[TMP15]], splat (i1 true)
4948
; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
5049
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
5150
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -126,9 +125,8 @@ define i64 @same_exit_block_pre_inc_use4() {
126125
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[TMP0]]
127126
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
128127
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 1
129-
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <2 x i64> [[VEC_IND]], [[WIDE_LOAD]]
128+
; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <2 x i64> [[VEC_IND]], [[WIDE_LOAD]]
130129
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2
131-
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
132130
; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
133131
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
134132
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
@@ -204,9 +202,8 @@ define i64 @loop_contains_safe_call() #1 {
204202
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
205203
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 1
206204
; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]])
207-
; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ult <4 x float> [[TMP3]], splat (float 3.000000e+00)
205+
; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oge <4 x float> [[TMP3]], splat (float 3.000000e+00)
208206
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
209-
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
210207
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
211208
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
212209
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
@@ -299,9 +296,8 @@ define i64 @loop_contains_safe_div() #1 {
299296
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
300297
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP2]], align 1
301298
; CHECK-NEXT: [[TMP13:%.*]] = udiv <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 20000)
302-
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 4 x i32> [[TMP13]], splat (i32 1)
299+
; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <vscale x 4 x i32> [[TMP13]], splat (i32 1)
303300
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX2]], [[TMP5]]
304-
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[TMP14]], splat (i1 true)
305301
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
306302
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[INDEX1]]
307303
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -381,12 +377,11 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
381377
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]]
382378
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
383379
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
384-
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 1)
380+
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 1)
385381
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P2]], i64 [[TMP0]]
386382
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
387383
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
388384
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
389-
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
390385
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
391386
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
392387
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)

llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
1010
; CHECK-VF4IC1: vector.body:
1111
; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
1212
; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
13-
; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], splat (i32 3)
14-
; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], splat (i1 true)
15-
; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
13+
; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_LOAD]], splat (i32 3)
14+
; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
1615
; CHECK-VF4IC1: middle.block:
1716
; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
1817
; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]]
@@ -25,18 +24,14 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
2524
; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
2625
; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
2726
; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
28-
; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, splat (i32 3)
29-
; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, splat (i32 3)
30-
; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, splat (i32 3)
31-
; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, splat (i32 3)
32-
; CHECK-VF4IC4-NEXT: [[NOT1:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP1]], splat (i1 true)
33-
; CHECK-VF4IC4-NEXT: [[NOT2:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP2]], splat (i1 true)
34-
; CHECK-VF4IC4-NEXT: [[NOT3:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP3]], splat (i1 true)
35-
; CHECK-VF4IC4-NEXT: [[NOT4:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP4]], splat (i1 true)
36-
; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or <vscale x 4 x i1> [[VEC_PHI1]], [[NOT1]]
37-
; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or <vscale x 4 x i1> [[VEC_PHI2]], [[NOT2]]
38-
; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or <vscale x 4 x i1> [[VEC_PHI3]], [[NOT3]]
39-
; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or <vscale x 4 x i1> [[VEC_PHI4]], [[NOT4]]
27+
; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp ne <vscale x 4 x i32> {{.*}}, splat (i32 3)
28+
; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp ne <vscale x 4 x i32> {{.*}}, splat (i32 3)
29+
; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp ne <vscale x 4 x i32> {{.*}}, splat (i32 3)
30+
; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp ne <vscale x 4 x i32> {{.*}}, splat (i32 3)
31+
; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or <vscale x 4 x i1> [[VEC_PHI1]], [[VEC_ICMP1]]
32+
; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or <vscale x 4 x i1> [[VEC_PHI2]], [[VEC_ICMP2]]
33+
; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or <vscale x 4 x i1> [[VEC_PHI3]], [[VEC_ICMP3]]
34+
; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or <vscale x 4 x i1> [[VEC_PHI4]], [[VEC_ICMP4]]
4035
; CHECK-VF4IC4: middle.block:
4136
; CHECK-VF4IC4-NEXT: [[OR1:%.*]] = or <vscale x 4 x i1> [[VEC_SEL2]], [[VEC_SEL1]]
4237
; CHECK-VF4IC4-NEXT: [[OR2:%.*]] = or <vscale x 4 x i1> [[VEC_SEL3]], [[OR1]]
@@ -71,9 +66,8 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
7166
; CHECK-VF4IC1: vector.body:
7267
; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
7368
; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
74-
; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], splat (i32 3)
75-
; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], splat (i1 true)
76-
; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
69+
; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_LOAD]], splat (i32 3)
70+
; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
7771
; CHECK-VF4IC1: middle.block:
7872
; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
7973
; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]]
@@ -105,9 +99,8 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
10599
; CHECK-VF4IC1: vector.body:
106100
; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
107101
; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x float>
108-
; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast ueq <vscale x 4 x float> [[VEC_LOAD]], splat (float 3.000000e+00)
109-
; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], splat (i1 true)
110-
; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
102+
; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast one <vscale x 4 x float> [[VEC_LOAD]], splat (float 3.000000e+00)
103+
; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
111104
; CHECK-VF4IC1: middle.block:
112105
; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
113106
; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]]

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -460,8 +460,7 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
460460
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP10]]
461461
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
462462
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
463-
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
464-
; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP13]], splat (i1 true)
463+
; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
465464
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
466465
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
467466
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32> zeroinitializer

llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,8 @@ define void @cond_uniform_load(ptr nocapture %dst, ptr nocapture readonly %src,
6262
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ]
6363
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0
6464
; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{%.*}}, i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
65-
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer
66-
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
67-
; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
65+
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[COND_LOAD]], zeroinitializer
66+
; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
6867
; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> poison)
6968
entry:
7069
br label %for.body

0 commit comments

Comments
 (0)