Skip to content

Commit 3ce39e4

Browse files
committed
Init implement
1 parent 3d2fd31 commit 3ce39e4

File tree

4 files changed

+49
-74
lines changed

4 files changed

+49
-74
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 32 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2388,8 +2388,8 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
23882388
AC->registerAssumption(II);
23892389

23902390
// End if-block.
2391-
bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2392-
if (IfPredicateInstr)
2391+
const VPRegionBlock *Region = RepRecipe->getParent()->getParent();
2392+
if (Region && Region->isReplicator())
23932393
PredicatedInstructions.push_back(Cloned);
23942394
}
23952395

@@ -8901,6 +8901,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89018901
bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
89028902
return Legal->blockNeedsPredication(BB) || NeedsBlends;
89038903
});
8904+
auto *MiddleVPBB =
8905+
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
8906+
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
89048907
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
89058908
// Relevant instructions from basic block BB will be grouped into VPRecipe
89068909
// ingredients and fill a new VPBasicBlock.
@@ -8931,8 +8934,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89318934
// with the final reduction value will be added to the exit block
89328935
StoreInst *SI;
89338936
if ((SI = dyn_cast<StoreInst>(&I)) &&
8934-
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8937+
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8938+
// Only create a recipe for the last intermediate store of the reduction.
8939+
if (!Legal->isInvariantStoreOfReduction(SI))
8940+
continue;
8941+
auto *Recipe = new VPReplicateRecipe(
8942+
SI, RecipeBuilder.mapToVPValues(Instr->operands()),
8943+
true /* IsUniform */);
8944+
RecipeBuilder.setRecipe(SI, Recipe);
8945+
Recipe->insertBefore(*MiddleVPBB, MBIP);
89358946
continue;
8947+
}
89368948

89378949
VPRecipeBase *Recipe =
89388950
RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
@@ -9130,51 +9142,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91309142
using namespace VPlanPatternMatch;
91319143
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
91329144
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9133-
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
9134-
// sank outside of the loop would keep the same order as they had in the
9135-
// original loop.
9136-
SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
9137-
for (VPRecipeBase &R : Header->phis()) {
9138-
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
9139-
ReductionPHIList.emplace_back(ReductionPhi);
9140-
}
9141-
bool HasIntermediateStore = false;
9142-
stable_sort(ReductionPHIList,
9143-
[this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
9144-
const VPReductionPHIRecipe *R2) {
9145-
auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
9146-
auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
9147-
HasIntermediateStore |= IS1 || IS2;
9148-
9149-
// If neither of the recipes has an intermediate store, keep the
9150-
// order the same.
9151-
if (!IS1 && !IS2)
9152-
return false;
9153-
9154-
// If only one of the recipes has an intermediate store, then
9155-
// move it towards the beginning of the list.
9156-
if (IS1 && !IS2)
9157-
return true;
9158-
9159-
if (!IS1 && IS2)
9160-
return false;
9161-
9162-
// If both recipes have an intermediate store, then the recipe
9163-
// with the later store should be processed earlier. So it
9164-
// should go to the beginning of the list.
9165-
return DT->dominates(IS2, IS1);
9166-
});
9167-
9168-
if (HasIntermediateStore && ReductionPHIList.size() > 1)
9169-
for (VPRecipeBase *R : ReductionPHIList)
9170-
R->moveBefore(*Header, Header->getFirstNonPhi());
9171-
91729145
for (VPRecipeBase &R : Header->phis()) {
91739146
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
91749147
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
91759148
continue;
91769149

91779150
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9151+
StoreInst* IntermediateStore = RdxDesc.IntermediateStore;
91789152
RecurKind Kind = RdxDesc.getRecurrenceKind();
91799153
assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
91809154
"AnyOf reductions are not allowed for in-loop reductions");
@@ -9187,9 +9161,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91879161
for (VPUser *U : Cur->users()) {
91889162
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
91899163
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9190-
assert(match(U, m_Binary<VPInstruction::ExtractFromEnd>(
9191-
m_VPValue(), m_VPValue())) &&
9192-
"U must be an ExtractFromEnd VPInstruction");
9164+
assert((match(U, m_Binary<VPInstruction::ExtractFromEnd>(
9165+
m_VPValue(), m_VPValue())) ||
9166+
(isa<VPReplicateRecipe>(U) &&
9167+
cast<VPReplicateRecipe>(U)->getUnderlyingValue() ==
9168+
IntermediateStore)) &&
9169+
"U must be either an ExtractFromEnd VPInstruction or a "
9170+
"uniform store sourced from the intermediate store.");
91939171
continue;
91949172
}
91959173
Worklist.insert(UserRecipe);
@@ -9304,6 +9282,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
93049282
continue;
93059283

93069284
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9285+
StoreInst *IntermediateStore = RdxDesc.IntermediateStore;
93079286
// Adjust AnyOf reductions; replace the reduction phi for the selected value
93089287
// with a boolean reduction phi node to check if the condition is true in
93099288
// any iteration. The final value is selected by the final
@@ -9406,11 +9385,14 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
94069385
auto *FinalReductionResult = new VPInstruction(
94079386
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
94089387
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9409-
OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User,
9410-
unsigned) {
9411-
return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue(),
9412-
m_VPValue()));
9413-
});
9388+
OrigExitingVPV->replaceUsesWithIf(
9389+
FinalReductionResult, [IntermediateStore](VPUser &User, unsigned) {
9390+
return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(
9391+
m_VPValue(), m_VPValue())) ||
9392+
(isa<VPReplicateRecipe>(&User) &&
9393+
cast<VPReplicateRecipe>(&User)->getUnderlyingValue() ==
9394+
IntermediateStore);
9395+
});
94149396
}
94159397

94169398
VPlanTransforms::clearReductionWrapFlags(*Plan);

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -601,14 +601,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
601601
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
602602
}
603603

604-
// If there were stores of the reduction value to a uniform memory address
605-
// inside the loop, create the final store here.
606-
if (StoreInst *SI = RdxDesc.IntermediateStore) {
607-
auto *NewSI = Builder.CreateAlignedStore(
608-
ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
609-
propagateMetadata(NewSI, SI);
610-
}
611-
612604
return ReducedPartRdx;
613605
}
614606
case VPInstruction::ExtractFromEnd: {

llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -596,10 +596,10 @@ exit: ; preds = %for.body
596596
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
597597
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
598598
; CHECK: middle.block:
599-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
600-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
601-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
602-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
599+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
600+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
601+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
602+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
603603
;
604604
entry:
605605
br label %for.body
@@ -625,10 +625,10 @@ exit:
625625
define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) {
626626
; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
627627
; CHECK: middle.block:
628-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
629-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
630-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
631-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
628+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
629+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
630+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
631+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
632632
;
633633
entry:
634634
br label %for.body
@@ -655,10 +655,10 @@ exit:
655655
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
656656
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
657657
; CHECK: middle.block:
658-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
659-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
660-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
661-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
658+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
659+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
660+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst1, align 4
661+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst2, align 4
662662
;
663663
entry:
664664
br label %for.body
@@ -684,10 +684,10 @@ exit:
684684
define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
685685
; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
686686
; CHECK: middle.block:
687-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
688-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
689-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
690-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
687+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
688+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
689+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst1, align 4
690+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst2, align 4
691691
;
692692
entry:
693693
br label %for.body

llvm/test/Transforms/LoopVectorize/vplan-printing.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
212212
; CHECK-EMPTY:
213213
; CHECK-NEXT: middle.block:
214214
; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
215+
; CHECK-NEXT: CLONE store vp<[[RED_RES]]>, ir<%dst>
215216
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]>
216217
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
217218
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph

0 commit comments

Comments
 (0)