Skip to content

Commit f8373cb

Browse files
Mel-Chen and fhahn authored
[LV] Reuse VPReplicateRecipe to handle scalar stores in exit block. (#106342)
This patch separates the computation of the final reduction result from the intermediate stores of the reduction.

Co-authored-by: Florian Hahn <[email protected]>
1 parent 147558e commit f8373cb

File tree

4 files changed

+48
-76
lines changed

4 files changed

+48
-76
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 31 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -2346,6 +2346,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
23462346
// End if-block.
23472347
VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
23482348
bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2349+
assert((Parent || all_of(RepRecipe->operands(),
2350+
[](VPValue *Op) {
2351+
return Op->isDefinedOutsideLoopRegions();
2352+
})) &&
2353+
"Expected a recipe is either within a region or all of its operands "
2354+
"are defined outside the vectorized region.");
23492355
if (IfPredicateInstr)
23502356
PredicatedInstructions.push_back(Cloned);
23512357
}
@@ -8950,6 +8956,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89508956
bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
89518957
return Legal->blockNeedsPredication(BB) || NeedsBlends;
89528958
});
8959+
auto *MiddleVPBB =
8960+
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
8961+
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
89538962
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
89548963
// Relevant instructions from basic block BB will be grouped into VPRecipe
89558964
// ingredients and fill a new VPBasicBlock.
@@ -8976,12 +8985,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89768985
Operands = {OpRange.begin(), OpRange.end()};
89778986
}
89788987

8979-
// Invariant stores inside loop will be deleted and a single store
8980-
// with the final reduction value will be added to the exit block
8988+
// The stores with invariant address inside the loop will be deleted, and
8989+
// in the exit block, a uniform store recipe will be created for the final
8990+
// invariant store of the reduction.
89818991
StoreInst *SI;
89828992
if ((SI = dyn_cast<StoreInst>(&I)) &&
8983-
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8993+
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8994+
// Only create recipe for the final invariant store of the reduction.
8995+
if (!Legal->isInvariantStoreOfReduction(SI))
8996+
continue;
8997+
auto *Recipe = new VPReplicateRecipe(
8998+
SI, RecipeBuilder.mapToVPValues(Instr->operands()),
8999+
true /* IsUniform */);
9000+
Recipe->insertBefore(*MiddleVPBB, MBIP);
89849001
continue;
9002+
}
89859003

89869004
VPRecipeBase *Recipe =
89879005
RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
@@ -9150,45 +9168,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91509168
using namespace VPlanPatternMatch;
91519169
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
91529170
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9153-
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
9154-
// sank outside of the loop would keep the same order as they had in the
9155-
// original loop.
9156-
SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
9157-
for (VPRecipeBase &R : Header->phis()) {
9158-
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
9159-
ReductionPHIList.emplace_back(ReductionPhi);
9160-
}
9161-
bool HasIntermediateStore = false;
9162-
stable_sort(ReductionPHIList,
9163-
[this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
9164-
const VPReductionPHIRecipe *R2) {
9165-
auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
9166-
auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
9167-
HasIntermediateStore |= IS1 || IS2;
9168-
9169-
// If neither of the recipes has an intermediate store, keep the
9170-
// order the same.
9171-
if (!IS1 && !IS2)
9172-
return false;
9173-
9174-
// If only one of the recipes has an intermediate store, then
9175-
// move it towards the beginning of the list.
9176-
if (IS1 && !IS2)
9177-
return true;
9178-
9179-
if (!IS1 && IS2)
9180-
return false;
9181-
9182-
// If both recipes have an intermediate store, then the recipe
9183-
// with the later store should be processed earlier. So it
9184-
// should go to the beginning of the list.
9185-
return DT->dominates(IS2, IS1);
9186-
});
9187-
9188-
if (HasIntermediateStore && ReductionPHIList.size() > 1)
9189-
for (VPRecipeBase *R : ReductionPHIList)
9190-
R->moveBefore(*Header, Header->getFirstNonPhi());
9191-
9171+
VPBasicBlock *MiddleVPBB =
9172+
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
91929173
for (VPRecipeBase &R : Header->phis()) {
91939174
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
91949175
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
@@ -9207,9 +9188,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
92079188
for (VPUser *U : Cur->users()) {
92089189
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
92099190
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9210-
assert(match(U, m_Binary<VPInstruction::ExtractFromEnd>(
9211-
m_VPValue(), m_VPValue())) &&
9212-
"U must be an ExtractFromEnd VPInstruction");
9191+
assert(UserRecipe->getParent() == MiddleVPBB &&
9192+
"U must be either in the loop region or the middle block.");
92139193
continue;
92149194
}
92159195
Worklist.insert(UserRecipe);
@@ -9314,8 +9294,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
93149294
}
93159295
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
93169296
Builder.setInsertPoint(&*LatchVPBB->begin());
9317-
VPBasicBlock *MiddleVPBB =
9318-
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
93199297
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
93209298
for (VPRecipeBase &R :
93219299
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
@@ -9390,12 +9368,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
93909368
// also modeled in VPlan.
93919369
auto *FinalReductionResult = new VPInstruction(
93929370
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9371+
// Update all users outside the vector region.
9372+
OrigExitingVPV->replaceUsesWithIf(
9373+
FinalReductionResult, [](VPUser &User, unsigned) {
9374+
auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9375+
return Parent && !Parent->getParent();
9376+
});
93939377
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9394-
OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User,
9395-
unsigned) {
9396-
return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue(),
9397-
m_VPValue()));
9398-
});
93999378

94009379
// Adjust AnyOf reductions; replace the reduction phi for the selected value
94019380
// with a boolean reduction phi node to check if the condition is true in

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -611,14 +611,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
611611
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
612612
}
613613

614-
// If there were stores of the reduction value to a uniform memory address
615-
// inside the loop, create the final store here.
616-
if (StoreInst *SI = RdxDesc.IntermediateStore) {
617-
auto *NewSI = Builder.CreateAlignedStore(
618-
ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
619-
propagateMetadata(NewSI, SI);
620-
}
621-
622614
return ReducedPartRdx;
623615
}
624616
case VPInstruction::ExtractFromEnd: {

llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -596,10 +596,10 @@ exit: ; preds = %for.body
596596
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
597597
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
598598
; CHECK: middle.block:
599-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
600-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
601-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
602-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
599+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
600+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
601+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
602+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
603603
;
604604
entry:
605605
br label %for.body
@@ -625,10 +625,10 @@ exit:
625625
define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) {
626626
; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
627627
; CHECK: middle.block:
628-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
629-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
630-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
631-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
628+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
629+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
630+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
631+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
632632
;
633633
entry:
634634
br label %for.body
@@ -655,10 +655,10 @@ exit:
655655
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
656656
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
657657
; CHECK: middle.block:
658-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
659-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
660-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
661-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
658+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
659+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
660+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst1, align 4
661+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst2, align 4
662662
;
663663
entry:
664664
br label %for.body
@@ -684,10 +684,10 @@ exit:
684684
define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
685685
; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
686686
; CHECK: middle.block:
687-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
688-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
689-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
690-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
687+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
688+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
689+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst1, align 4
690+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst2, align 4
691691
;
692692
entry:
693693
br label %for.body

llvm/test/Transforms/LoopVectorize/vplan-printing.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
212212
; CHECK-EMPTY:
213213
; CHECK-NEXT: middle.block:
214214
; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
215+
; CHECK-NEXT: CLONE store vp<[[RED_RES]]>, ir<%dst>
215216
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
216217
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
217218
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph

0 commit comments

Comments
 (0)