Skip to content

Commit 12fb133

Browse files
[LoopVectorize] Support conditional in-loop vector reductions
Extends getReductionOpChain to look through Phis which may be part of the reduction chain.

adjustRecipesForReductions will now also create a CondOp for VPReductionRecipe if the block is predicated, and not only if foldTailByMasking is true.

Changes were required in tryToBlend to ensure that we don't attempt to convert the reduction Phi into a select by returning a VPBlendRecipe. The VPReductionRecipe will create a select between the Phi and the reduction.

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D117580
1 parent a2c267e commit 12fb133

File tree

6 files changed

+1257
-42
lines changed

6 files changed

+1257
-42
lines changed

llvm/lib/Analysis/IVDescriptors.cpp

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,7 +1058,7 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
10581058
// to check for a pair of icmp/select, for which we use getNextInstruction and
10591059
// isCorrectOpcode functions to step the right number of instruction, and
10601060
// check the icmp/select pair.
1061-
// FIXME: We also do not attempt to look through Phi/Select's yet, which might
1061+
// FIXME: We also do not attempt to look through Select's yet, which might
10621062
// be part of the reduction chain, or attempt to looks through And's to find a
10631063
// smaller bitwidth. Subs are also currently not allowed (which are usually
10641064
// treated as part of a add reduction) as they are expected to generally be
@@ -1068,16 +1068,21 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
10681068
if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp)
10691069
ExpectedUses = 2;
10701070

1071-
auto getNextInstruction = [&](Instruction *Cur) {
1072-
if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
1073-
// We are expecting a icmp/select pair, which we go to the next select
1074-
// instruction if we can. We already know that Cur has 2 uses.
1075-
if (isa<SelectInst>(*Cur->user_begin()))
1076-
return cast<Instruction>(*Cur->user_begin());
1077-
else
1078-
return cast<Instruction>(*std::next(Cur->user_begin()));
1071+
auto getNextInstruction = [&](Instruction *Cur) -> Instruction * {
1072+
for (auto User : Cur->users()) {
1073+
Instruction *UI = cast<Instruction>(User);
1074+
if (isa<PHINode>(UI))
1075+
continue;
1076+
if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
1077+
// We are expecting a icmp/select pair, which we go to the next select
1078+
// instruction if we can. We already know that Cur has 2 uses.
1079+
if (isa<SelectInst>(UI))
1080+
return UI;
1081+
continue;
1082+
}
1083+
return UI;
10791084
}
1080-
return cast<Instruction>(*Cur->user_begin());
1085+
return nullptr;
10811086
};
10821087
auto isCorrectOpcode = [&](Instruction *Cur) {
10831088
if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) {
@@ -1092,22 +1097,46 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
10921097
return Cur->getOpcode() == RedOp;
10931098
};
10941099

1100+
// Attempt to look through Phis which are part of the reduction chain
1101+
unsigned ExtraPhiUses = 0;
1102+
Instruction *RdxInstr = LoopExitInstr;
1103+
if (auto ExitPhi = dyn_cast<PHINode>(LoopExitInstr)) {
1104+
if (ExitPhi->getNumIncomingValues() != 2)
1105+
return {};
1106+
1107+
Instruction *Inc0 = dyn_cast<Instruction>(ExitPhi->getIncomingValue(0));
1108+
Instruction *Inc1 = dyn_cast<Instruction>(ExitPhi->getIncomingValue(1));
1109+
1110+
Instruction *Chain = nullptr;
1111+
if (Inc0 == Phi)
1112+
Chain = Inc1;
1113+
else if (Inc1 == Phi)
1114+
Chain = Inc0;
1115+
else
1116+
return {};
1117+
1118+
RdxInstr = Chain;
1119+
ExtraPhiUses = 1;
1120+
}
1121+
10951122
// The loop exit instruction we check first (as a quick test) but add last. We
10961123
// check the opcode is correct (and dont allow them to be Subs) and that they
10971124
// have expected to have the expected number of uses. They will have one use
10981125
// from the phi and one from a LCSSA value, no matter the type.
1099-
if (!isCorrectOpcode(LoopExitInstr) || !LoopExitInstr->hasNUses(2))
1126+
if (!isCorrectOpcode(RdxInstr) || !LoopExitInstr->hasNUses(2))
11001127
return {};
11011128

1102-
// Check that the Phi has one (or two for min/max) uses.
1103-
if (!Phi->hasNUses(ExpectedUses))
1129+
// Check that the Phi has one (or two for min/max) uses, plus an extra use
1130+
// for conditional reductions.
1131+
if (!Phi->hasNUses(ExpectedUses + ExtraPhiUses))
11041132
return {};
1133+
11051134
Instruction *Cur = getNextInstruction(Phi);
11061135

11071136
// Each other instruction in the chain should have the expected number of uses
11081137
// and be the correct opcode.
1109-
while (Cur != LoopExitInstr) {
1110-
if (!isCorrectOpcode(Cur) || !Cur->hasNUses(ExpectedUses))
1138+
while (Cur != RdxInstr) {
1139+
if (!Cur || !isCorrectOpcode(Cur) || !Cur->hasNUses(ExpectedUses))
11111140
return {};
11121141

11131142
ReductionOperations.push_back(Cur);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8593,13 +8593,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
85938593
return Operands[0];
85948594
}
85958595

8596+
unsigned NumIncoming = Phi->getNumIncomingValues();
8597+
// For in-loop reductions, we do not need to create an additional select.
8598+
VPValue *InLoopVal = nullptr;
8599+
for (unsigned In = 0; In < NumIncoming; In++) {
8600+
PHINode *PhiOp =
8601+
dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8602+
if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8603+
assert(!InLoopVal && "Found more than one in-loop reduction!");
8604+
InLoopVal = Operands[In];
8605+
}
8606+
}
8607+
8608+
assert((!InLoopVal || NumIncoming == 2) &&
8609+
"Found an in-loop reduction for PHI with unexpected number of "
8610+
"incoming values");
8611+
if (InLoopVal)
8612+
return Operands[Operands[0] == InLoopVal ? 1 : 0];
8613+
85968614
// We know that all PHIs in non-header blocks are converted into selects, so
85978615
// we don't have to worry about the insertion order and we can just use the
85988616
// builder. At this point we generate the predication tree. There may be
85998617
// duplications since this is a simple recursive scan, but future
86008618
// optimizations will clean it up.
86018619
SmallVector<VPValue *, 2> OperandsWithMask;
8602-
unsigned NumIncoming = Phi->getNumIncomingValues();
86038620

86048621
for (unsigned In = 0; In < NumIncoming; In++) {
86058622
VPValue *EdgeMask =
@@ -9423,7 +9440,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
94239440
R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
94249441
VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
94259442

9426-
auto *CondOp = CM.foldTailByMasking()
9443+
auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
94279444
? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
94289445
: nullptr;
94299446

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s
3+
4+
define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){
5+
; CHECK-LABEL: @cond_fadd(
6+
; CHECK-NEXT: entry:
7+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
8+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
9+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
10+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
11+
; CHECK: vector.ph:
12+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
13+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
14+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
15+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
16+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
17+
; CHECK: vector.body:
18+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
19+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
20+
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
21+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]]
22+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
23+
; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
24+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
25+
; CHECK-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
26+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
27+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
28+
; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
29+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
30+
; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer
31+
; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP12]])
32+
; CHECK-NEXT: [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
33+
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
34+
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
35+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
36+
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
37+
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
38+
; CHECK: middle.block:
39+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
40+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
41+
; CHECK: scalar.ph:
42+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
43+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
44+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
45+
; CHECK: for.body:
46+
; CHECK-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], [[FOR_INC:%.*]] ]
47+
; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
48+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[INDVARS]]
49+
; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4
50+
; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 2.000000e+00
51+
; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
52+
; CHECK: if.then:
53+
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS]]
54+
; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX2]], align 4
55+
; CHECK-NEXT: [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP19]]
56+
; CHECK-NEXT: br label [[FOR_INC]]
57+
; CHECK: for.inc:
58+
; CHECK-NEXT: [[RES]] = phi float [ [[FADD]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
59+
; CHECK-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1
60+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[N]]
61+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
62+
; CHECK: for.end:
63+
; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
64+
; CHECK-NEXT: ret float [[RES_LCSSA]]
65+
;
66+
entry:
67+
br label %for.body
68+
69+
for.body:
70+
%indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.inc ]
71+
%rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ]
72+
%arrayidx = getelementptr inbounds float, float* %cond, i64 %indvars
73+
%0 = load float, float* %arrayidx
74+
%tobool = fcmp une float %0, 2.000000e+00
75+
br i1 %tobool, label %if.then, label %for.inc
76+
77+
if.then:
78+
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars
79+
%1 = load float, float* %arrayidx2
80+
%fadd = fadd fast float %rdx, %1
81+
br label %for.inc
82+
83+
for.inc:
84+
%res = phi float [ %fadd, %if.then ], [ %rdx, %for.body ]
85+
%indvars.next = add nuw nsw i64 %indvars, 1
86+
%exitcond.not = icmp eq i64 %indvars.next, %N
87+
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
88+
89+
for.end:
90+
ret float %res
91+
}
92+
93+
define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) {
94+
; CHECK-LABEL: @cond_cmp_sel(
95+
; CHECK-NEXT: entry:
96+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
97+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
98+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
99+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
100+
; CHECK: vector.ph:
101+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
102+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
103+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
104+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
105+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
106+
; CHECK: vector.body:
107+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
108+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
109+
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
110+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]]
111+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
112+
; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
113+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
114+
; CHECK-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
115+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
116+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
117+
; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
118+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
119+
; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0xFFF0000000000000, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
120+
; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP12]])
121+
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP13]], [[VEC_PHI]]
122+
; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP13]], float [[VEC_PHI]]
123+
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
124+
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
125+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
126+
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
127+
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
128+
; CHECK: middle.block:
129+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
130+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
131+
; CHECK: scalar.ph:
132+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
133+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
134+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
135+
; CHECK: for.body:
136+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
137+
; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
138+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]]
139+
; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
140+
; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP17]], 3.000000e+00
141+
; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
142+
; CHECK: if.then:
143+
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
144+
; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
145+
; CHECK-NEXT: [[FCMP:%.*]] = fcmp fast olt float [[RDX]], [[TMP18]]
146+
; CHECK-NEXT: [[FSEL:%.*]] = select fast i1 [[FCMP]], float [[RDX]], float [[TMP18]]
147+
; CHECK-NEXT: br label [[FOR_INC]]
148+
; CHECK: for.inc:
149+
; CHECK-NEXT: [[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[FSEL]], [[IF_THEN]] ]
150+
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
151+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
152+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
153+
; CHECK: for.end:
154+
; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
155+
; CHECK-NEXT: ret float [[RES_LCSSA]]
156+
;
157+
entry:
158+
br label %for.body
159+
160+
for.body:
161+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
162+
%rdx = phi float [ %res, %for.inc ], [ 1.000000e+00, %entry ]
163+
%arrayidx = getelementptr inbounds float, float* %cond, i64 %iv
164+
%0 = load float, float* %arrayidx
165+
%tobool = fcmp une float %0, 3.000000e+00
166+
br i1 %tobool, label %if.then, label %for.inc
167+
168+
if.then:
169+
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
170+
%1 = load float, float* %arrayidx2
171+
%fcmp = fcmp fast olt float %rdx, %1
172+
%fsel = select fast i1 %fcmp, float %rdx, float %1
173+
br label %for.inc
174+
175+
for.inc:
176+
%res = phi float [ %rdx, %for.body ], [ %fsel, %if.then ]
177+
%iv.next = add i64 %iv, 1
178+
%exitcond.not = icmp eq i64 %iv.next, %N
179+
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
180+
181+
for.end:
182+
ret float %res
183+
}
184+
185+
!0 = distinct !{!0, !1}
186+
!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

0 commit comments

Comments (0)