Commit 1aeee98

[VPlan] Model address separately.

Move vector pointer generation to a separate VPInstruction opcode. This untangles address computation from the memory recipes and is also needed to enable explicit unrolling in VPlan.

Pull Request: #72164
Parent: 2ca101f
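
The two new opcodes print as "vector-pointer" and "vector-pointer-reverse" (see the VPlanRecipes.cpp hunk below). As a rough, hypothetical illustration of the modeling change, a VPlan for a consecutive reversed load now carries the address as a separate recipe feeding the memory recipe (value names here are made up, not taken from an actual dump):

    EMIT vp<%vec.ptr> = vector-pointer-reverse ir<%gep>
    WIDEN ir<%load> = load vp<%vec.ptr>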

62 files changed: +849 -772 lines


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 15 additions & 46 deletions
@@ -8174,13 +8174,22 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
 
+  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+  if (Decision != LoopVectorizationCostModel::CM_GatherScatter &&
+      Decision != LoopVectorizationCostModel::CM_Interleave) {
+    auto *VectorPtr = new VPInstruction(
+        Reverse ? VPInstruction::VectorPtrReverse : VPInstruction::VectorPtr,
+        {Ptr}, I->getDebugLoc());
+    Builder.getInsertBlock()->appendRecipe(VectorPtr);
+    Ptr = VectorPtr;
+  }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
-    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
-                                              Consecutive, Reverse);
+    return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
+                                              Reverse);
 
   StoreInst *Store = cast<StoreInst>(I);
-  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
-                                            Mask, Consecutive, Reverse);
+  return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
+                                            Consecutive, Reverse);
 }
 
 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -9485,44 +9494,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
     }
   }
 
-  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
-    // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = nullptr;
-
-    // Use i32 for the gep index type when the value is constant,
-    // or query DataLayout for a more suitable index type otherwise.
-    const DataLayout &DL =
-        Builder.GetInsertBlock()->getModule()->getDataLayout();
-    Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
-                        ? DL.getIndexType(PointerType::getUnqual(
-                              ScalarDataTy->getContext()))
-                        : Builder.getInt32Ty();
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    if (isReverse()) {
-      // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
-      // RunTimeVF = VScale * VF.getKnownMinValue()
-      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
-      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
-      // NumElt = -Part * RunTimeVF
-      Value *NumElt =
-          Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
-      // LastLane = 1 - RunTimeVF
-      Value *LastLane =
-          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
-      PartPtr =
-          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
-    } else {
-      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
-    }
-
-    return PartPtr;
-  };
-
   // Handle Stores:
   if (SI) {
     State.setDebugLocFrom(SI->getDebugLoc());
@@ -9543,8 +9514,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
         // We don't want to update the value in the map as it might be used in
         // another expression. So don't call resetVectorValue(StoredVal).
       }
-      auto *VecPtr =
-          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+      auto *VecPtr = State.get(getAddr(), Part);
       if (isMaskRequired)
         NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                           BlockInMaskParts[Part]);
@@ -9568,8 +9538,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
                                          nullptr, "wide.masked.gather");
         State.addMetadata(NewLI, LI);
       } else {
-        auto *VecPtr =
-            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+        auto *VecPtr = State.get(getAddr(), Part);
        if (isMaskRequired)
          NewLI = Builder.CreateMaskedLoad(
              DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
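
A practical consequence of modeling the address separately, visible in the test updates below: the per-part address GEPs are now emitted before the memory instructions that use them, instead of being interleaved with them. A minimal IR sketch for a consecutive <4 x i32> store unrolled by two (value names are illustrative):

    %base  = getelementptr inbounds i32, ptr %dst, i64 %idx   ; scalar address, used directly by part 0
    %gep.1 = getelementptr inbounds i32, ptr %base, i64 4     ; part 1: base + 1 * VF elements
    store <4 x i32> %v0, ptr %base, align 4
    store <4 x i32> %v1, ptr %gep.1, align 4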

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 5 additions & 1 deletion
@@ -1061,7 +1061,9 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     // Increment the canonical IV separately for each unrolled part.
     CanonicalIVIncrementForPart,
     BranchOnCount,
-    BranchOnCond
+    BranchOnCond,
+    VectorPtr,
+    VectorPtrReverse
   };
 
 private:
@@ -1168,6 +1170,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrementForPart:
     case VPInstruction::BranchOnCount:
+    case VPInstruction::VectorPtr:
+    case VPInstruction::VectorPtrReverse:
       return true;
     };
     llvm_unreachable("switch should return");

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 52 additions & 0 deletions
@@ -121,6 +121,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     case VPInstruction::Not:
     case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrementForPart:
+    case VPInstruction::VectorPtr:
+    case VPInstruction::VectorPtrReverse:
       return false;
     default:
       return true;
@@ -397,6 +399,50 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
+  case VPInstruction::VectorPtr:
+  case VPInstruction::VectorPtrReverse: {
+    // Calculate the pointer for the specific unroll-part.
+    Value *PartPtr = nullptr;
+    bool IsReverse = getOpcode() == VPInstruction::VectorPtrReverse;
+    auto *MemR = cast<VPWidenMemoryInstructionRecipe>(*user_begin());
+    Type *ScalarDataTy =
+        MemR->isStore() ? cast<StoreInst>(&MemR->getIngredient())
+                              ->getValueOperand()
+                              ->getType()
+                        : cast<LoadInst>(&MemR->getIngredient())->getType();
+    // Use i32 for the gep index type when the value is constant,
+    // or query DataLayout for a more suitable index type otherwise.
+    const DataLayout &DL =
+        Builder.GetInsertBlock()->getModule()->getDataLayout();
+    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
+                        ? DL.getIndexType(ScalarDataTy->getPointerTo())
+                        : Builder.getInt32Ty();
+    Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
+    bool InBounds = false;
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = GEP->isInBounds();
+    if (IsReverse) {
+      // If the address is consecutive but reversed, then the
+      // wide store needs to start at the last vector element.
+      // RunTimeVF = VScale * VF.getKnownMinValue()
+      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
+      // NumElt = -Part * RunTimeVF
+      Value *NumElt = Builder.CreateMul(
+          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
+      // LastLane = 1 - RunTimeVF
+      Value *LastLane =
+          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
+      PartPtr =
+          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
+    } else {
+      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
+    }
+
+    return PartPtr;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -473,6 +519,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BranchOnCount:
     O << "branch-on-count";
     break;
+  case VPInstruction::VectorPtr:
+    O << "vector-pointer";
+    break;
+  case VPInstruction::VectorPtrReverse:
+    O << "vector-pointer-reverse";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
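
To make the reverse-pointer arithmetic above concrete, here is a sketch of the IR the VectorPtrReverse case would emit for i32 elements, fixed VF = 4, and unroll part 1 (names illustrative; for fixed-width VFs getRuntimeVF folds to the constant 4, and the index type is i32):

    ; NumElt   = -Part * RunTimeVF = -1 * 4 = -4
    ; LastLane = 1 - RunTimeVF     = 1 - 4  = -3
    %part.ptr = getelementptr i32, ptr %ptr, i32 -4
    %rev.ptr  = getelementptr i32, ptr %part.ptr, i32 -3

The wide access thus starts seven elements below the scalar address, at the lowest-addressed lane of the reversed part-1 group; the loaded or stored vector itself is reversed separately by the memory recipe.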

llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll

Lines changed: 2 additions & 2 deletions
@@ -176,8 +176,8 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
 ; CHECK-NEXT:    [[TMP10:%.*]] = trunc <16 x i16> [[TMP8]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[INDEX]] to i64
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP11]]
-; CHECK-NEXT:    store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 16
+; CHECK-NEXT:    store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    store <16 x i8> [[TMP10]], ptr [[TMP13]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
@@ -459,8 +459,8 @@ define void @old_and_new_size_equalko(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
-; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
