Commit 9bb3ad8

[VPlan] Model address separately.

Move vector pointer generation to a separate VPInstruction opcode. This untangles address computation from the memory recipes and is also needed to enable explicit unrolling in VPlan in the future.

Pull Request: #72164

1 parent: 6a126e2
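
To make the new structure concrete, here is a minimal, self-contained C++ sketch of the split this commit introduces. The mock types are invented for illustration and are not the actual VPlan API: a dedicated address recipe produces one pointer per unroll part, and the widened memory recipe consumes that address as an ordinary operand instead of computing it internally.

#include <cassert>

// Mock stand-ins (hypothetical; the real recipes are a VPInstruction with
// the new VectorPtr opcode and VPWidenMemoryInstructionRecipe).
struct MockVectorPtrRecipe {
  long Base;      // scalar base pointer, tracked as an element index
  long RunTimeVF; // VScale * VF.getKnownMinValue(); VScale == 1 if fixed-width
  // Part N of a consecutive access starts N * RunTimeVF elements past Base.
  long perPart(unsigned Part) const { return Base + Part * RunTimeVF; }
};

struct MockWidenMemoryRecipe {
  const MockVectorPtrRecipe *Addr; // the address is now just an operand
  long pointerFor(unsigned Part) const { return Addr->perPart(Part); }
};

int main() {
  MockVectorPtrRecipe VecPtr{/*Base=*/0, /*RunTimeVF=*/4};
  MockWidenMemoryRecipe Load{&VecPtr};
  // With VF = 4 and two unrolled parts, consecutive parts start 4 apart.
  assert(Load.pointerFor(0) == 0 && Load.pointerFor(1) == 4);
  return 0;
}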

File tree

62 files changed: +868 additions, -794 deletions

(Large commits have some content hidden by default; only a subset of the changed files is shown below.)

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 15 additions & 46 deletions

@@ -8268,13 +8268,22 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
 
+  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+  if (Decision != LoopVectorizationCostModel::CM_GatherScatter &&
+      Decision != LoopVectorizationCostModel::CM_Interleave) {
+    auto *VectorPtr = new VPInstruction(
+        Reverse ? VPInstruction::VectorPtrReverse : VPInstruction::VectorPtr,
+        {Ptr}, I->getDebugLoc());
+    Builder.getInsertBlock()->appendRecipe(VectorPtr);
+    Ptr = VectorPtr;
+  }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
-    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
-                                              Consecutive, Reverse);
+    return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
+                                              Reverse);
 
   StoreInst *Store = cast<StoreInst>(I);
-  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
-                                            Mask, Consecutive, Reverse);
+  return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
+                                            Consecutive, Reverse);
 }
 
 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also

@@ -9569,44 +9578,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
       BlockInMaskParts[Part] = Mask;
     }
 
-  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
-    // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = nullptr;
-
-    // Use i32 for the gep index type when the value is constant,
-    // or query DataLayout for a more suitable index type otherwise.
-    const DataLayout &DL =
-        Builder.GetInsertBlock()->getModule()->getDataLayout();
-    Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
-                        ? DL.getIndexType(PointerType::getUnqual(
-                              ScalarDataTy->getContext()))
-                        : Builder.getInt32Ty();
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    if (isReverse()) {
-      // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
-      // RunTimeVF = VScale * VF.getKnownMinValue()
-      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
-      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
-      // NumElt = -Part * RunTimeVF
-      Value *NumElt =
-          Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
-      // LastLane = 1 - RunTimeVF
-      Value *LastLane =
-          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
-      PartPtr =
-          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
-    } else {
-      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
-    }
-
-    return PartPtr;
-  };
-
   // Handle Stores:
   if (SI) {
     State.setDebugLocFrom(SI->getDebugLoc());

@@ -9627,8 +9598,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
         // We don't want to update the value in the map as it might be used in
         // another expression. So don't call resetVectorValue(StoredVal).
       }
-      auto *VecPtr =
-          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+      auto *VecPtr = State.get(getAddr(), Part);
       if (isMaskRequired)
         NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                           BlockInMaskParts[Part]);

@@ -9652,8 +9622,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
                                          nullptr, "wide.masked.gather");
       State.addMetadata(NewLI, LI);
     } else {
-      auto *VecPtr =
-          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+      auto *VecPtr = State.get(getAddr(), Part);
       if (isMaskRequired)
         NewLI = Builder.CreateMaskedLoad(
             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
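
The execute() change above replaces per-recipe address recomputation (the removed CreateVecPtr lambda) with a plain lookup of the value the vector-pointer recipe already generated for each unroll part. A rough sketch of that lookup pattern, with a hypothetical MockTransformState standing in for VPTransformState:

#include <cassert>
#include <map>
#include <utility>

// Hypothetical mock of the per-part value map inside VPTransformState:
// once the vector-pointer recipe has executed, State.get(Def, Part) simply
// returns the value it produced for that unroll part.
struct MockTransformState {
  std::map<std::pair<int, unsigned>, long> PerPart; // (def, part) -> value
  long get(int Def, unsigned Part) const { return PerPart.at({Def, Part}); }
};

int main() {
  MockTransformState State;
  int Addr = 0; // stand-in for the VPValue defined by vector-pointer
  State.PerPart[{Addr, 0}] = 100; // pointer for part 0
  State.PerPart[{Addr, 1}] = 116; // pointer for part 1 (e.g. 16 bytes later)
  // The memory recipe's execute() now reduces to this single lookup,
  // mirroring `auto *VecPtr = State.get(getAddr(), Part);` above.
  assert(State.get(Addr, 1) == 116);
  return 0;
}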

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 5 additions & 1 deletion

@@ -1038,7 +1038,9 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     // canonical IV separately for each unrolled part.
     CanonicalIVIncrementForPart,
     BranchOnCount,
-    BranchOnCond
+    BranchOnCond,
+    VectorPtr,
+    VectorPtrReverse
   };
 
 private:

@@ -1146,6 +1148,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementForPart:
     case VPInstruction::BranchOnCount:
+    case VPInstruction::VectorPtr:
+    case VPInstruction::VectorPtrReverse:
       return true;
     };
     llvm_unreachable("switch should return");

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 52 additions & 0 deletions

@@ -122,6 +122,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementForPart:
+    case VPInstruction::VectorPtr:
+    case VPInstruction::VectorPtrReverse:
       return false;
     default:
       return true;

@@ -404,6 +406,50 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
+  case VPInstruction::VectorPtr:
+  case VPInstruction::VectorPtrReverse: {
+    // Calculate the pointer for the specific unroll-part.
+    Value *PartPtr = nullptr;
+    bool IsReverse = getOpcode() == VPInstruction::VectorPtrReverse;
+    auto *MemR = cast<VPWidenMemoryInstructionRecipe>(*user_begin());
+    Type *ScalarDataTy =
+        MemR->isStore() ? cast<StoreInst>(&MemR->getIngredient())
+                              ->getValueOperand()
+                              ->getType()
+                        : cast<LoadInst>(&MemR->getIngredient())->getType();
+    // Use i32 for the gep index type when the value is constant,
+    // or query DataLayout for a more suitable index type otherwise.
+    const DataLayout &DL =
+        Builder.GetInsertBlock()->getModule()->getDataLayout();
+    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
+                        ? DL.getIndexType(ScalarDataTy->getPointerTo())
+                        : Builder.getInt32Ty();
+    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
+    bool InBounds = false;
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = GEP->isInBounds();
+    if (IsReverse) {
+      // If the address is consecutive but reversed, then the
+      // wide store needs to start at the last vector element.
+      // RunTimeVF = VScale * VF.getKnownMinValue()
+      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
+      // NumElt = -Part * RunTimeVF
+      Value *NumElt = Builder.CreateMul(
+          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
+      // LastLane = 1 - RunTimeVF
+      Value *LastLane =
+          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
+      PartPtr =
+          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
+    } else {
+      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
+    }
+
+    return PartPtr;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }

@@ -483,6 +529,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BranchOnCount:
     O << "branch-on-count";
     break;
+  case VPInstruction::VectorPtr:
+    O << "vector-pointer";
+    break;
+  case VPInstruction::VectorPtrReverse:
+    O << "vector-pointer-reverse";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
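
The reverse case is the subtle part of generateInstruction above. As a worked check (not part of the commit; the helper name reverseStartIndex is invented for this sketch), plugging RunTimeVF = 4 and Part = 1 into the formulas gives NumElt = -4 and LastLane = -3, so the two emitted GEPs land 7 elements before the base, the lowest-addressed lane of the reversed second part:

#include <cassert>
#include <cstdint>

// Worked check of the VectorPtrReverse arithmetic using plain element
// indices; the GEPs in the commit perform the same computation in IR.
int64_t reverseStartIndex(int64_t Part, int64_t RunTimeVF) {
  int64_t NumElt = -Part * RunTimeVF; // step back over earlier parts
  int64_t LastLane = 1 - RunTimeVF;   // then back to the part's last lane
  return NumElt + LastLane;
}

int main() {
  // RunTimeVF = 4: part 0 covers base[-3..0], part 1 covers base[-7..-4].
  assert(reverseStartIndex(0, 4) == -3);
  assert(reverseStartIndex(1, 4) == -7);
  return 0;
}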

llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll

Lines changed: 9 additions & 9 deletions

@@ -179,8 +179,8 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
 ; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8>
 ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[INDEX]] to i64
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP9]]
-; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 16
+; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1
 ; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP11]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992

@@ -193,18 +193,18 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[A]] to i16
 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> undef, i16 [[TMP14]], i64 0
 ; CHECK-NEXT: [[TMP16:%.*]] = mul <8 x i16> [[TMP15]], [[TMP13]]
 ; CHECK-NEXT: [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]], <i16 8, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
 ; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i16> [[TMP17]] to <8 x i8>
 ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP18]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[INDEX4]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[INDEX3]] to i64
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP20]]
 ; CHECK-NEXT: store <8 x i8> [[TMP19]], ptr [[TMP21]], align 1
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 8
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000
+; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i32 [[INDEX3]], 8
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT8]], 1000
 ; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]

@@ -268,19 +268,19 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 %
 ; CHECK: vec.epilog.ph:
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[A]] to i16
 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> undef, i16 [[TMP10]], i64 0
 ; CHECK-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP11]], <i16 99, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>
 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP12]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP14:%.*]] = lshr <8 x i16> [[TMP13]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 ; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[C]], <8 x i16> [[TMP14]], <8 x i16> [[TMP13]]
 ; CHECK-NEXT: [[TMP16:%.*]] = trunc <8 x i16> [[TMP15]] to <8 x i8>
-; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[INDEX2]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[INDEX1]] to i64
 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP17]]
 ; CHECK-NEXT: store <8 x i8> [[TMP16]], ptr [[TMP18]], align 1
-; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 1000
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 1000
 ; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
