Commit f18536d
[VPlan] Model address separately. (#72164)

Move vector pointer generation to a separate VPVectorPointerRecipe. This untangles address computation from the memory recipes and is also needed to enable explicit unrolling in VPlan.

1 parent ff80414 commit f18536d
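As a rough illustration (the value names below are invented, not taken from the commit), a consecutive widened load whose address used to be computed inside the memory recipe is now fed by a dedicated recipe, so a plan excerpt might print as:

  vp<%3> = vector-pointer ir<%gep>
  WIDEN ir<%lv> = load vp<%3>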

72 files changed: +901 −798 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Lines changed: 13 additions & 46 deletions

@@ -8174,13 +8174,20 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

+  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+  if (Consecutive) {
+    auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+                                                Reverse, I->getDebugLoc());
+    Builder.getInsertBlock()->appendRecipe(VectorPtr);
+    Ptr = VectorPtr;
+  }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
-    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
-                                              Consecutive, Reverse);
+    return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
+                                              Reverse);

   StoreInst *Store = cast<StoreInst>(I);
-  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
-                                            Mask, Consecutive, Reverse);
+  return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
+                                            Consecutive, Reverse);
 }

 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -9485,44 +9492,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
     }
   }

-  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
-    // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = nullptr;
-
-    // Use i32 for the gep index type when the value is constant,
-    // or query DataLayout for a more suitable index type otherwise.
-    const DataLayout &DL =
-        Builder.GetInsertBlock()->getModule()->getDataLayout();
-    Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
-                        ? DL.getIndexType(PointerType::getUnqual(
-                              ScalarDataTy->getContext()))
-                        : Builder.getInt32Ty();
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    if (isReverse()) {
-      // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
-      // RunTimeVF = VScale * VF.getKnownMinValue()
-      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
-      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
-      // NumElt = -Part * RunTimeVF
-      Value *NumElt =
-          Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
-      // LastLane = 1 - RunTimeVF
-      Value *LastLane =
-          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
-      PartPtr =
-          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
-    } else {
-      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
-    }
-
-    return PartPtr;
-  };
-
   // Handle Stores:
   if (SI) {
     State.setDebugLocFrom(SI->getDebugLoc());
@@ -9543,8 +9512,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
           // We don't want to update the value in the map as it might be used in
           // another expression. So don't call resetVectorValue(StoredVal).
         }
-        auto *VecPtr =
-            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+        auto *VecPtr = State.get(getAddr(), Part);
         if (isMaskRequired)
           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                             BlockInMaskParts[Part]);
@@ -9568,8 +9536,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
                                            nullptr, "wide.masked.gather");
         State.addMetadata(NewLI, LI);
       } else {
-        auto *VecPtr =
-            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+        auto *VecPtr = State.get(getAddr(), Part);
         if (isMaskRequired)
           NewLI = Builder.CreateMaskedLoad(
               DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
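With the CreateVecPtr lambda gone, the memory recipe no longer computes addresses itself: VPVectorPointerRecipe::execute (see VPlanRecipes.cpp below) sets one pointer per unroll part in the transform state, so State.get(getAddr(), Part) here simply retrieves the pointer already computed for the requested part.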

llvm/lib/Transforms/Vectorize/VPlan.h
Lines changed: 30 additions & 0 deletions

@@ -1357,6 +1357,36 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue {
 #endif
 };

+/// A recipe to compute the pointers for widened memory accesses of IndexTy for
+/// all parts. If IsReverse is true, compute pointers for accessing the input in
+/// reverse order per part.
+class VPVectorPointerRecipe : public VPRecipeBase, public VPValue {
+  Type *IndexedTy;
+  bool IsReverse;
+
+public:
+  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse,
+                        DebugLoc DL)
+      : VPRecipeBase(VPDef::VPVectorPointerSC, {Ptr}, DL), VPValue(this),
+        IndexedTy(IndexedTy), IsReverse(IsReverse) {}
+
+  VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
+
+  void execute(VPTransformState &State) override;
+
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A pure virtual base class for all recipes modeling header phis, including
 /// phis for first order recurrences, pointer inductions and reductions. The
 /// start value is the first operand of the recipe and the incoming value from
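The VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) line above wires the new recipe into LLVM-style RTTI. A minimal hypothetical sketch of what that enables (the surrounding loop and the VPBB block are invented for illustration):

  // Dispatch on the new recipe kind; VPBB is some VPBasicBlock being visited.
  for (VPRecipeBase &R : *VPBB)
    if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
      // Address computation can now be inspected or moved independently of
      // the memory recipe that consumes it.
      (void)VecPtr;
    }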

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Lines changed: 53 additions & 0 deletions

@@ -1209,6 +1209,59 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif

+void VPVectorPointerRecipe ::execute(VPTransformState &State) {
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    // Calculate the pointer for the specific unroll-part.
+    Value *PartPtr = nullptr;
+    // Use i32 for the gep index type when the value is constant,
+    // or query DataLayout for a more suitable index type otherwise.
+    const DataLayout &DL =
+        Builder.GetInsertBlock()->getModule()->getDataLayout();
+    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
+                        ? DL.getIndexType(IndexedTy->getPointerTo())
+                        : Builder.getInt32Ty();
+    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
+    bool InBounds = false;
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = GEP->isInBounds();
+    if (IsReverse) {
+      // If the address is consecutive but reversed, then the
+      // wide store needs to start at the last vector element.
+      // RunTimeVF = VScale * VF.getKnownMinValue()
+      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
+      // NumElt = -Part * RunTimeVF
+      Value *NumElt = Builder.CreateMul(
+          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
+      // LastLane = 1 - RunTimeVF
+      Value *LastLane =
+          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
+      PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
+    } else {
+      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+      PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
+    }
+
+    State.set(this, PartPtr, Part);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent;
+  printAsOperand(O, SlotTracker);
+  O << " = vector-pointer ";
+  if (IsReverse)
+    O << "(reverse) ";
+
+  printOperands(O, SlotTracker);
+}
+#endif
+
 void VPBlendRecipe::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
   // We know that all PHIs in non-header blocks are converted into
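To sanity-check the reverse addressing above: with a fixed-width VF of 4 (so RunTimeVF = 4, VScale = 1) and two unroll parts, part 0's wide access starts at element offset -3 and part 1's at -7, i.e. at the lowest-addressed lane of each reversed part. A standalone sketch of that arithmetic (plain C++, not LLVM API; the VF and UF values are illustrative):

  #include <cstdio>

  int main() {
    const long RunTimeVF = 4; // VScale * VF.getKnownMinValue(); VScale == 1 for fixed-width
    for (long Part = 0; Part < 2; ++Part) {
      long NumElt = -Part * RunTimeVF; // offset of this part's first logical element
      long LastLane = 1 - RunTimeVF;   // step back to the lowest-addressed lane
      // Start of the wide, reversed access for this part, in elements:
      std::printf("Part %ld starts at element %ld\n", Part, NumElt + LastLane);
    }
    return 0; // prints -3 for part 0 and -7 for part 1
  }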

llvm/lib/Transforms/Vectorize/VPlanValue.h
Lines changed: 1 addition & 0 deletions

@@ -351,6 +351,7 @@ class VPDef {
     VPReductionSC,
     VPReplicateSC,
     VPScalarIVStepsSC,
+    VPVectorPointerSC,
     VPWidenCallSC,
     VPWidenCanonicalIVSC,
     VPWidenCastSC,

llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
Lines changed: 2 additions & 2 deletions

@@ -176,8 +176,8 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
 ; CHECK-NEXT:    [[TMP10:%.*]] = trunc <16 x i16> [[TMP8]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[INDEX]] to i64
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP11]]
-; CHECK-NEXT:    store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 16
+; CHECK-NEXT:    store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    store <16 x i8> [[TMP10]], ptr [[TMP13]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
@@ -459,8 +459,8 @@ define void @old_and_new_size_equalko(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
-; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
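The only change in these checks is instruction order: because VPVectorPointerRecipe::execute emits the address GEPs for all unroll parts before the memory recipe runs, the second part's getelementptr now precedes the first store.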
