Skip to content

Commit e6bbf55

Browse files
committed
Init implement
1 parent b953914 commit e6bbf55

File tree

4 files changed

+47
-72
lines changed

4 files changed

+47
-72
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 30 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -8926,6 +8926,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89268926
bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
89278927
return Legal->blockNeedsPredication(BB) || NeedsBlends;
89288928
});
8929+
auto *MiddleVPBB =
8930+
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
8931+
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
89298932
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
89308933
// Relevant instructions from basic block BB will be grouped into VPRecipe
89318934
// ingredients and fill a new VPBasicBlock.
@@ -8956,8 +8959,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89568959
// with the final reduction value will be added to the exit block
89578960
StoreInst *SI;
89588961
if ((SI = dyn_cast<StoreInst>(&I)) &&
8959-
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8962+
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8963+
// Only create a recipe for the last intermediate store of the reduction.
8964+
if (!Legal->isInvariantStoreOfReduction(SI))
8965+
continue;
8966+
auto *Recipe = new VPReplicateRecipe(
8967+
SI, RecipeBuilder.mapToVPValues(Instr->operands()),
8968+
true /* IsUniform */);
8969+
RecipeBuilder.setRecipe(SI, Recipe);
8970+
Recipe->insertBefore(*MiddleVPBB, MBIP);
89608971
continue;
8972+
}
89618973

89628974
VPRecipeBase *Recipe =
89638975
RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
@@ -9126,51 +9138,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91269138
using namespace VPlanPatternMatch;
91279139
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
91289140
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9129-
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
9130-
// sunk outside of the loop would keep the same order as they had in the
9131-
// original loop.
9132-
SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
9133-
for (VPRecipeBase &R : Header->phis()) {
9134-
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
9135-
ReductionPHIList.emplace_back(ReductionPhi);
9136-
}
9137-
bool HasIntermediateStore = false;
9138-
stable_sort(ReductionPHIList,
9139-
[this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
9140-
const VPReductionPHIRecipe *R2) {
9141-
auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
9142-
auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
9143-
HasIntermediateStore |= IS1 || IS2;
9144-
9145-
// If neither of the recipes has an intermediate store, keep the
9146-
// order the same.
9147-
if (!IS1 && !IS2)
9148-
return false;
9149-
9150-
// If only one of the recipes has an intermediate store, then
9151-
// move it towards the beginning of the list.
9152-
if (IS1 && !IS2)
9153-
return true;
9154-
9155-
if (!IS1 && IS2)
9156-
return false;
9157-
9158-
// If both recipes have an intermediate store, then the recipe
9159-
// with the later store should be processed earlier. So it
9160-
// should go to the beginning of the list.
9161-
return DT->dominates(IS2, IS1);
9162-
});
9163-
9164-
if (HasIntermediateStore && ReductionPHIList.size() > 1)
9165-
for (VPRecipeBase *R : ReductionPHIList)
9166-
R->moveBefore(*Header, Header->getFirstNonPhi());
9167-
91689141
for (VPRecipeBase &R : Header->phis()) {
91699142
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
91709143
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
91719144
continue;
91729145

91739146
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9147+
StoreInst *IntermediateStore = RdxDesc.IntermediateStore;
91749148
RecurKind Kind = RdxDesc.getRecurrenceKind();
91759149
assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
91769150
"AnyOf reductions are not allowed for in-loop reductions");
@@ -9183,9 +9157,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91839157
for (VPUser *U : Cur->users()) {
91849158
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
91859159
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9186-
assert(match(U, m_Binary<VPInstruction::ExtractFromEnd>(
9187-
m_VPValue(), m_VPValue())) &&
9188-
"U must be an ExtractFromEnd VPInstruction");
9160+
assert((match(U, m_Binary<VPInstruction::ExtractFromEnd>(
9161+
m_VPValue(), m_VPValue())) ||
9162+
(isa<VPReplicateRecipe>(U) &&
9163+
cast<VPReplicateRecipe>(U)->getUnderlyingValue() ==
9164+
IntermediateStore)) &&
9165+
"U must be either an ExtractFromEnd VPInstruction or a "
9166+
"uniform store sourced from the intermediate store.");
91899167
continue;
91909168
}
91919169
Worklist.insert(UserRecipe);
@@ -9300,6 +9278,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
93009278
continue;
93019279

93029280
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9281+
StoreInst *IntermediateStore = RdxDesc.IntermediateStore;
93039282
// Adjust AnyOf reductions; replace the reduction phi for the selected value
93049283
// with a boolean reduction phi node to check if the condition is true in
93059284
// any iteration. The final value is selected by the final
@@ -9402,11 +9381,14 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
94029381
auto *FinalReductionResult = new VPInstruction(
94039382
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
94049383
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9405-
OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User,
9406-
unsigned) {
9407-
return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue(),
9408-
m_VPValue()));
9409-
});
9384+
OrigExitingVPV->replaceUsesWithIf(
9385+
FinalReductionResult, [IntermediateStore](VPUser &User, unsigned) {
9386+
return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(
9387+
m_VPValue(), m_VPValue())) ||
9388+
(isa<VPReplicateRecipe>(&User) &&
9389+
cast<VPReplicateRecipe>(&User)->getUnderlyingValue() ==
9390+
IntermediateStore);
9391+
});
94109392
}
94119393

94129394
VPlanTransforms::clearReductionWrapFlags(*Plan);

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -614,14 +614,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
614614
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
615615
}
616616

617-
// If there were stores of the reduction value to a uniform memory address
618-
// inside the loop, create the final store here.
619-
if (StoreInst *SI = RdxDesc.IntermediateStore) {
620-
auto *NewSI = Builder.CreateAlignedStore(
621-
ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
622-
propagateMetadata(NewSI, SI);
623-
}
624-
625617
return ReducedPartRdx;
626618
}
627619
case VPInstruction::ExtractFromEnd: {

llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -596,10 +596,10 @@ exit: ; preds = %for.body
596596
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
597597
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
598598
; CHECK: middle.block:
599-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
600-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
601-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
602-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
599+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
600+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
601+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
602+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
603603
;
604604
entry:
605605
br label %for.body
@@ -625,10 +625,10 @@ exit:
625625
define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) {
626626
; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
627627
; CHECK: middle.block:
628-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
629-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
630-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
631-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
628+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
629+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
630+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
631+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
632632
;
633633
entry:
634634
br label %for.body
@@ -655,10 +655,10 @@ exit:
655655
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
656656
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
657657
; CHECK: middle.block:
658-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
659-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
660-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
661-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
658+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
659+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
660+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst1, align 4
661+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst2, align 4
662662
;
663663
entry:
664664
br label %for.body
@@ -684,10 +684,10 @@ exit:
684684
define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
685685
; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
686686
; CHECK: middle.block:
687-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
688-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
689-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
690-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
687+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
688+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
689+
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst1, align 4
690+
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst2, align 4
691691
;
692692
entry:
693693
br label %for.body

llvm/test/Transforms/LoopVectorize/vplan-printing.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
212212
; CHECK-EMPTY:
213213
; CHECK-NEXT: middle.block:
214214
; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
215+
; CHECK-NEXT: CLONE store vp<[[RED_RES]]>, ir<%dst>
215216
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
216217
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
217218
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph

0 commit comments

Comments
 (0)