Skip to content

Commit 6eb0f45

Browse files
committed
[VPlan] Truncate/Extend ComputeReductionResult at construction (NFC).
Instead of looking up the narrower reduction type via getRecurrenceType, we can generate the needed extend directly at construction and re-use the truncated value from the loop.
1 parent b68565b commit 6eb0f45

File tree

6 files changed

+43
-56
lines changed

6 files changed

+43
-56
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7536,6 +7536,13 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
75367536
// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
75377537
// over the incoming values correctly.
75387538
using namespace VPlanPatternMatch;
7539+
if (EpiRedResult->getNumUsers() == 1 &&
7540+
isa<VPInstructionWithType>(*EpiRedResult->user_begin())) {
7541+
EpiRedResult = cast<VPInstructionWithType>(*EpiRedResult->user_begin());
7542+
assert((EpiRedResult->getOpcode() == Instruction::SExt ||
7543+
EpiRedResult->getOpcode() == Instruction::ZExt) &&
7544+
"can only have SExt/ZExt users");
7545+
}
75397546
assert(count_if(EpiRedResult->users(), IsaPred<VPPhi>) == 1 &&
75407547
"ResumePhi must have a single user");
75417548
auto *EpiResumePhiVPI =
@@ -9468,28 +9475,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
94689475
PhiR->setOperand(1, NewExitingVPV);
94699476
}
94709477

9471-
// If the vector reduction can be performed in a smaller type, we truncate
9472-
// then extend the loop exit value to enable InstCombine to evaluate the
9473-
// entire expression in the smaller type.
9474-
if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9475-
!RecurrenceDescriptor::isAnyOfRecurrenceKind(
9476-
RdxDesc.getRecurrenceKind())) {
9477-
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9478-
Type *RdxTy = RdxDesc.getRecurrenceType();
9479-
auto *Trunc =
9480-
new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9481-
auto *Extnd =
9482-
RdxDesc.isSigned()
9483-
? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9484-
: new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9485-
9486-
Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9487-
Extnd->insertAfter(Trunc);
9488-
if (PhiR->getOperand(1) == NewExitingVPV)
9489-
PhiR->setOperand(1, Extnd->getVPSingleValue());
9490-
NewExitingVPV = Extnd;
9491-
}
9492-
94939478
// We want code in the middle block to appear to execute on the location of
94949479
// the scalar loop's latch terminator because: (a) it is all compiler
94959480
// generated, (b) these instructions are always executed after evaluating
@@ -9521,6 +9506,31 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
95219506
Builder.createNaryOp(VPInstruction::ComputeReductionResult,
95229507
{PhiR, NewExitingVPV}, Flags, ExitDL);
95239508
}
9509+
// If the vector reduction can be performed in a smaller type, we truncate
9510+
// then extend the loop exit value to enable InstCombine to evaluate the
9511+
// entire expression in the smaller type.
9512+
if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9513+
!RecurrenceDescriptor::isAnyOfRecurrenceKind(
9514+
RdxDesc.getRecurrenceKind())) {
9515+
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9516+
Type *RdxTy = RdxDesc.getRecurrenceType();
9517+
auto *Trunc =
9518+
new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9519+
Instruction::CastOps ExtendOpc =
9520+
RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
9521+
auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
9522+
Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9523+
Extnd->insertAfter(Trunc);
9524+
if (PhiR->getOperand(1) == NewExitingVPV)
9525+
PhiR->setOperand(1, Extnd->getVPSingleValue());
9526+
9527+
// Update ComputeReductionResult with the truncated exiting value and
9528+
// extend its result.
9529+
FinalReductionResult->setOperand(1, Trunc);
9530+
FinalReductionResult =
9531+
Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
9532+
}
9533+
95249534
// Update all users outside the vector region.
95259535
OrigExitingVPV->replaceUsesWithIf(
95269536
FinalReductionResult, [FinalReductionResult](VPUser &User, unsigned) {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -633,15 +633,13 @@ Value *VPInstruction::generate(VPTransformState &State) {
633633
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
634634
// and will be removed by breaking up the recipe further.
635635
auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
636-
auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
637636
// Get its reduction variable descriptor.
638637
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
639638

640639
RecurKind RK = RdxDesc.getRecurrenceKind();
641640
assert(!RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
642641
"should be handled by ComputeFindLastIVResult");
643642

644-
Type *PhiTy = OrigPhi->getType();
645643
// The recipe's operands are the reduction phi, followed by one operand for
646644
// each part of the reduction.
647645
unsigned UF = getNumOperands() - 1;
@@ -653,15 +651,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
653651
if (hasFastMathFlags())
654652
Builder.setFastMathFlags(getFastMathFlags());
655653

656-
// If the vector reduction can be performed in a smaller type, we truncate
657-
// then extend the loop exit value to enable InstCombine to evaluate the
658-
// entire expression in the smaller type.
659-
// TODO: Handle this in truncateToMinBW.
660-
if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
661-
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
662-
for (unsigned Part = 0; Part < UF; ++Part)
663-
RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
664-
}
665654
// Reduce all of the unrolled parts into a single vector.
666655
Value *ReducedPartRdx = RdxParts[0];
667656
if (PhiR->isOrdered()) {
@@ -687,19 +676,14 @@ Value *VPInstruction::generate(VPTransformState &State) {
687676
// TODO: Support in-order reductions based on the recurrence descriptor.
688677
// All ops in the reduction inherit fast-math-flags from the recurrence
689678
// descriptor.
690-
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
679+
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
680+
auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
691681
ReducedPartRdx =
692682
createAnyOfReduction(Builder, ReducedPartRdx,
693683
RdxDesc.getRecurrenceStartValue(), OrigPhi);
694-
else
684+
} else
695685
ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
696686

697-
// If the reduction can be performed in a smaller type, we need to extend
698-
// the reduction to the wider type before we branch to the original loop.
699-
if (PhiTy != RdxDesc.getRecurrenceType())
700-
ReducedPartRdx = RdxDesc.isSigned()
701-
? Builder.CreateSExt(ReducedPartRdx, PhiTy)
702-
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
703687
}
704688

705689
return ReducedPartRdx;
@@ -1040,6 +1024,7 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
10401024
void VPInstructionWithType::execute(VPTransformState &State) {
10411025
State.setDebugLocFrom(getDebugLoc());
10421026
switch (getOpcode()) {
1027+
case Instruction::SExt:
10431028
case Instruction::ZExt:
10441029
case Instruction::Trunc: {
10451030
Value *Op = State.get(getOperand(0), VPLane(0));

llvm/test/Transforms/LoopVectorize/X86/cost-model.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1167,8 +1167,7 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 {
11671167
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
11681168
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
11691169
; CHECK: middle.block:
1170-
; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i1>
1171-
; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP10]])
1170+
; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
11721171
; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32
11731172
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
11741173
; CHECK: scalar.ph:

llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,7 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
208208
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
209209
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
210210
; CHECK: middle.block:
211-
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i16>
212-
; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP9]])
211+
; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]])
213212
; CHECK-NEXT: [[TMP11:%.*]] = zext i16 [[TMP10]] to i32
214213
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
215214
; CHECK: vec.epilog.iter.check:
@@ -234,8 +233,7 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
234233
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 256
235234
; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
236235
; CHECK: vec.epilog.middle.block:
237-
; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
238-
; CHECK-NEXT: [[TMP23:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP22]])
236+
; CHECK-NEXT: [[TMP23:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP19]])
239237
; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP23]] to i32
240238
; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
241239
; CHECK: vec.epilog.scalar.ph:

llvm/test/Transforms/LoopVectorize/reduction-small-size.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
2525
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
2626
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
2727
; CHECK: middle.block:
28-
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
29-
; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP6]])
28+
; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]])
3029
; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32
3130
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
3231
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -104,8 +103,7 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) {
104103
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
105104
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
106105
; CHECK: middle.block:
107-
; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i8>
108-
; CHECK-NEXT: [[TMP9:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP8]])
106+
; CHECK-NEXT: [[TMP9:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP5]])
109107
; CHECK-NEXT: [[TMP10:%.*]] = zext i8 [[TMP9]] to i32
110108
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
111109
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -183,8 +181,7 @@ define i32 @PR35734(i32 %x, i32 %y) {
183181
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
184182
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
185183
; CHECK: middle.block:
186-
; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i1>
187-
; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> [[TMP8]])
184+
; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> [[TMP5]])
188185
; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i32
189186
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
190187
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]

llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,7 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
2828
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP31]]
2929
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], {{%.*}}
3030
; CHECK: middle.block:
31-
; CHECK-NEXT: [[TMP37:%.*]] = trunc <vscale x 8 x i32> [[TMP34]] to <vscale x 8 x i8>
32-
; CHECK-NEXT: [[TMP38:%.*]] = trunc <vscale x 8 x i32> [[TMP36]] to <vscale x 8 x i8>
33-
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i8> [[TMP38]], [[TMP37]]
31+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i8> [[TMP35]], [[TMP33]]
3432
; CHECK-NEXT: [[TMP39:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[BIN_RDX]])
3533
; CHECK-NEXT: [[TMP40:%.*]] = zext i8 [[TMP39]] to i32
3634
; CHECK-NEXT: %cmp.n = icmp eq i32 256, %n.vec

0 commit comments

Comments
 (0)