[LV] Support strided load with a stride of -1 #128718
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms
Author: Mel Chen (Mel-Chen)
Changes
Patch is 118.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128718.diff
9 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..6635a91563384 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1143,6 +1143,7 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
+ CM_Strided,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -1308,6 +1309,18 @@ class LoopVectorizationCostModel {
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
+ /// Returns true if the target machine can represent \p V as a strided load
+ /// or store operation.
+ bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
+ if (!isa<LoadInst, StoreInst>(V))
+ return false;
+ auto *Ty = getLoadStoreType(V);
+ Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
+ return TTI.isLegalStridedLoadStore(Ty, Align);
+ }
+
/// Returns true if the target machine supports all of the reduction
/// variables found for the given VF.
bool canVectorizeReductions(ElementCount VF) const {
@@ -1645,6 +1658,9 @@ class LoopVectorizationCostModel {
/// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
+ /// The cost computation for strided load/store instruction.
+ InstructionCost getStridedLoadStoreCost(Instruction *I, ElementCount VF);
+
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I,
@@ -5813,6 +5829,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+ ElementCount VF) {
+ Type *ValTy = getLoadStoreType(I);
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
+
+ return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment,
+ CostKind, I);
+}
+
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
ElementCount VF,
@@ -6132,6 +6161,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
"Expected consecutive stride.");
InstWidening Decision =
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ // Consider using strided load/store for consecutive reverse accesses to
+ // achieve more efficient memory operations.
+ if (ConsecutiveStride == -1) {
+ const InstructionCost StridedLoadStoreCost =
+ isLegalStridedLoadStore(&I, VF) ? getStridedLoadStoreCost(&I, VF)
+ : InstructionCost::getInvalid();
+ if (StridedLoadStoreCost < Cost) {
+ Decision = CM_Strided;
+ Cost = StridedLoadStoreCost;
+ }
+ }
setWideningDecision(&I, VF, Decision, Cost);
continue;
}
@@ -6777,6 +6817,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return TTI::CastContextHint::Normal;
switch (getWideningDecision(I, VF)) {
+ // TODO: New CastContextHint for strided accesses.
+ case LoopVectorizationCostModel::CM_Strided:
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
case LoopVectorizationCostModel::CM_Interleave:
@@ -8335,6 +8377,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
@@ -8362,11 +8405,11 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- I->getDebugLoc());
+ Strided, I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, I->getDebugLoc());
+ Reverse, Strided, I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8089cfd1ce802..34b4c2c8a0be1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2611,6 +2611,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;
+ /// Whether the accessed addresses are evenly spaced apart by a fixed stride.
+ bool Strided;
+
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -2624,9 +2627,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse, DebugLoc DL)
+ bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
- Reverse(Reverse) {
+ Reverse(Reverse), Strided(Strided) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
}
@@ -2654,6 +2657,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// order.
bool isReverse() const { return Reverse; }
+ /// Return whether the accessed addresses are evenly spaced apart by a fixed
+ /// stride.
+ bool isStrided() const { return Strided; }
+
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -2683,16 +2690,16 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// optional mask.
struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse, DebugLoc DL)
+ bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, DL),
+ Reverse, Strided, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse,
+ getMask(), Consecutive, Reverse, Strided,
getDebugLoc());
}
@@ -2724,7 +2731,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
{L.getAddr(), &EVL}, L.isConsecutive(),
- L.isReverse(), L.getDebugLoc()),
+ L.isReverse(), L.isStrided(), L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -2761,16 +2768,17 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
- VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
+ VPValue *Mask, bool Consecutive, bool Reverse,
+ bool Strided, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, DL) {
+ Consecutive, Reverse, Strided, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, getDebugLoc());
+ Reverse, Strided, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -2804,7 +2812,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{S.getAddr(), S.getStoredValue(), &EVL},
- S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
+ S.isConsecutive(), S.isReverse(), S.isStrided(),
+ S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..e62ae690caa6d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2609,10 +2609,15 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
- return Ctx.TTI.getAddressComputationCost(Ty) +
- Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, Ctx.CostKind,
- &Ingredient);
+ if (Strided)
+ return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
+ else
+ return Ctx.TTI.getAddressComputationCost(Ty) +
+ Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
}
InstructionCost Cost = 0;
@@ -2639,11 +2644,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive();
+ bool CreateGather = !isConsecutive() && !isStrided();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = nullptr;
+ Value *Mask = isStrided()
+ ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+ : nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2658,9 +2665,25 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
- NewLI =
- Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
- PoisonValue::get(DataTy), "wide.masked.load");
+ if (isStrided()) {
+ const DataLayout &DL = LI->getDataLayout();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+ Value *RuntimeVF =
+ getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+ NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideVal, Mask, RuntimeVF}, nullptr, "wide.strided.load");
+ cast<CallInst>(NewLI)->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ } else {
+ NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
+ }
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
@@ -2698,7 +2721,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive();
+ bool CreateGather = !isConsecutive() && !isStrided();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
@@ -2718,6 +2741,16 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
+ } else if (isStrided()) {
+ const DataLayout &DL = LI->getDataLayout();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+ NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideVal, Mask, EVL}, nullptr, "wide.strided.load");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
@@ -2772,13 +2805,15 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredVPValue = getStoredValue();
- bool CreateScatter = !isConsecutive();
+ bool CreateScatter = !isConsecutive() && !isStrided();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = nullptr;
+ Value *Mask = isStrided()
+ ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+ : nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2797,12 +2832,32 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
- if (CreateScatter)
+ if (CreateScatter) {
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
- else if (Mask)
- NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
- else
+ } else if (Mask) {
+ if (isStrided()) {
+ const DataLayout &DL = SI->getDataLayout();
+ auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+ Type *StoredEltTy = StoredVecTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+ Value *RuntimeVF =
+ getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+ NewSI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_store,
+ {StoredVecTy, PtrTy, StrideTy},
+ {StoredVal, Addr, StrideVal, Mask, RuntimeVF});
+ cast<CallInst>(NewSI)->addParamAttr(
+ 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+ } else {
+ NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ }
+ } else {
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+ }
State.addMetadata(NewSI, SI);
}
@@ -2818,7 +2873,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredValue = getStoredValue();
- bool CreateScatter = !isConsecutive();
+ bool CreateScatter = !isConsecutive() && !isStrided();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
@@ -2843,11 +2898,25 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Store, Type::getVoidTy(EVL->getContext()),
- {StoredVal, Addr}));
+ if (isStrided()) {
+ const DataLayout &DL = SI->getDataLayout();
+ auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+ Type *StoredEltTy = StoredVecTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+ NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+ {StoredVecTy, PtrTy, StrideTy},
+ {StoredVal, Addr, StrideVal, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c917e4eef655..fd6984208a760 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -73,13 +73,13 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/,
+ false /*Consecutive*/, false /*Reverse*/, false /*Strided*/,
Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- Ingredient.getDebugLoc());
+ false /*Strided*/, Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
new file mode 100644
index 0000000000000..1aa01d7aabee9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -0,0 +1,642 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; This is the loop in c++ being vectorize in this file with
+;; vector.reverse
+;; #pragma clang loop vectorize_width(4, scalable)
+;; for (int i = N-1; i >= 0; --i)
+;; a[i] = b[i] ...
[truncated]
@@ -1308,6 +1309,18 @@ class LoopVectorizationCostModel {
         (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine can represent \p V as a strided load
  /// or store operation.
  bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
I think it would be better to have the format of the function similar to isLegalGatherOrScatter, isLegalMaskedLoad, etc.
I copied the function name from the TTI function. Do you have any recommended names?
It is not about the name, more about the function parameters
The prototype of isLegalGatherOrScatter is:
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
And the prototype of isLegalMaskedLoad is:
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment)
isLegalStridedLoadStore is the same as isLegalGatherOrScatter now.
Do you prefer the prototype of isLegalMaskedLoad?
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 1966fd5 to c54369e
This patch is ready for review. Please take a look, thanks.
@@ -1312,6 +1313,22 @@ class LoopVectorizationCostModel {
         (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine can represent \p V as a strided load
  /// or store operation.
  bool isLegalStridedLoadStore(Value *V, ElementCount VF) const {
I'm suggesting to use the prototype of isLegalMaskedLoad here; most of these functions do the analysis based on type/alignment info.
After consideration, I have decided to rename isLegalStridedLoadStore to stridedAccessCanBeWidened and align its prototype with memoryInstructionCanBeWidened and interleavedAccessCanBeWidened.
The reason is that, unlike isLegalGatherOrScatter and isLegalMaskedLoad, which primarily check whether the target supports certain operations, strided memory access requires additional checks, such as checking the stride. Its functionality is likely closer to memoryInstructionCanBeWidened and interleavedAccessCanBeWidened.
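For concreteness, a minimal sketch of how such a prototype could look if it mirrors memoryInstructionCanBeWidened; the exact signature and any additional stride checks are assumptions here, not the code from the latest revision:
// Illustrative sketch only: take the instruction and candidate VF, then
// query TTI legality on the widened type, as the current
// isLegalStridedLoadStore in this patch does.
bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const {
  // Only loads and stores can become strided memory operations.
  if (!isa<LoadInst, StoreInst>(I))
    return false;
  Type *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  if (VF.isVector())
    Ty = VectorType::get(Ty, VF);
  return TTI.isLegalStridedLoadStore(Ty, Alignment);
}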
// TODO: Support non-unit-reverse strided accesses.
Value *Index =
    Strided
        ? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
Strided means negative stride? Would be good to clarify in the comment for Strided.
Just for now. Later it is supposed to support positive (and <-1 negative) and runtime stride values
Should Strided eventually be some sort of std::optional<int>? Or is the plan to also allow runtime strided values? In which case I presume we need to model it as a VPValue operand.
This is the initial patch for strided memory accesses, currently only supporting cases where the stride is -1. A follow-up patch will focus on stride analysis to support more strided memory accesses.
However, I believe it would be beneficial to discuss and decide how VPlan should handle stride in this initial patch. My current plan is to extend the existing VPWidenLoadRecipe/VPWidenStoreRecipe and VPVectorPointerRecipe by explicitly passing the stride into the recipe. For example:
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
                  VPValue *Stride,
                  bool Consecutive, bool Reverse, DebugLoc DL)
A memory access will only be considered strided when Stride is explicitly provided (i.e., not nullptr).
@Mel-Chen yes, that's a good idea. Since it will generate a vp intrinsic either way.
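A minimal sketch of how recipe creation could distinguish the two cases under that plan; the StrideVPV name and the way the constant is materialized are assumptions for illustration, not code from the patch:
// Hypothetical illustration: a non-null Stride operand marks the access as
// strided, nullptr keeps today's behavior. Only stride -1 in this patch.
VPValue *StrideVPV = nullptr;
if (Decision == LoopVectorizationCostModel::CM_Strided)
  StrideVPV = Plan.getOrAddLiveIn(
      ConstantInt::getSigned(Type::getInt64Ty(I->getContext()), -1));
if (auto *Load = dyn_cast<LoadInst>(I))
  return new VPWidenLoadRecipe(*Load, Ptr, Mask, StrideVPV, Consecutive,
                               Reverse, I->getDebugLoc());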
It is a bit confusing that VPVectorPointerRecipe w/o stride steps by positive increment and with Strided by negative increment. Should it be VectorPointerEndRecipe?
Let me explain why we need a vector-pointer here instead of a vector-end-pointer.
In the current implementation of reverse loads (CM_WidenReverse), the vectorizer emits a regular load followed by a reverse.
For example, to load VF elements in reverse starting from index n (say VF = 4), we want to access [n, n-1, n-2, n-3].
With load + reverse, we actually load [n-3, n-2, n-1, n], and then reverse the vector:
%L = load [n-3, n-2, n-1, n]
%RL = reverse %L
So in this case, the vector-end-pointer should get the pointer that points to n-3.
However, when using a strided load with a negative stride to implement reverse loads, it behaves more like a regular consecutive load in terms of pointer computation:
%RL = strided.load n, -1
That’s why I chose to extend vector-pointer rather than rely on vector-end-pointer.
That said, I now think it would be better to explicitly pass in the stride to vector-pointer, rather than using a bool Strided flag.
Alternatively, would you prefer introducing a new pointer recipe specifically for strided cases where the stride is not 1?
@fhahn Do you think we should introduce a new pointer recipe, or is it better to just extend the existing vector-pointer recipe? I prefer the latter one.
I think it should be fine to add a stride argument for now, with a brief comment documenting the behavior.
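A minimal sketch of that direction, generalizing the index computation quoted above; treating Stride here as a plain integer member is an assumption, not the patch's final form:
// Hypothetical generalization: scale the per-part increment by an explicit
// stride. Stride == 1 reproduces the consecutive case, Stride == -1 the
// reverse case handled by this patch; other values are future work.
Value *Index =
    Stride == 1
        ? Increment
        : Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, Stride));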
…pe. NFC After llvm#128718 lands there will be two ways of performing a reversed widened memory access, either by performing a consecutive unit-stride access and a reverse, or a strided access with a negative stride. Even though both produce a reversed vector, only the former needs VPReverseVectorPointerRecipe which computes a pointer to the last element of the widened vector. A strided reverse still needs a pointer to the first element of each part so it will use VPVectorPointerRecipe. This renames VPReverseVectorPointerRecipe to VPVectorEndPointerRecipe to clarify that a reversed access may not necessarily need a pointer to the last element.
@@ -1720,16 +1723,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
};

/// A recipe to compute the pointers for widened memory accesses of IndexTy.
/// Supports both consecutive and reverse consecutive accesses.
Is this still accurate? VPVectorPointerEndRecipe is also for reversed accesses?
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
                              public VPUnrollPartAccessor<1> {
  Type *IndexedTy;

  /// Indicate whether to compute the pointer for strided memory accesses.
Should document what the stride is.
const InstructionCost StridedLoadStoreCost =
    Ctx.TTI.getStridedMemoryOpCost(
        Ingredient.getOpcode(), DataTy,
        getLoadStorePointerOperand(&Ingredient), MemR->isMasked(),
Would be good to avoid passing the pointer operand from the ingredient here, as it may not be accurate any longer. What does the interface use the pointer operand for? Ideally we would pass the required info directly w/o having to pass an IR value. Do you think that would be feasible?
Otherwise, can we pass the underlying value from the pointer operand, if it exists?
getStridedMemoryOpCost falls back to getGatherScatterOpCost when the target doesn't support strided access.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,
Align Alignment,
TTI::TargetCostKind CostKind,
const Instruction *I) const override {
// For a target without strided memory operations (or for an illegal
// operation type on one which does), assume we lower to a gather/scatter
// operation. (Which may in turn be scalarized.)
return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
Alignment, CostKind, I);
}
Since getGatherScatterOpCost uses the pointer operand, I went ahead with the latter approach you suggested.
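For reference, a sketch of the resulting cost query using the underlying value of the address operand; variable names follow the snippet quoted above, and the trailing arguments are completed from the getStridedMemoryOpCost signature rather than taken from the patch:
// Hypothetical sketch: query the strided cost with the address operand's
// underlying IR value (which may be nullptr) instead of the ingredient's
// pointer operand.
const Value *PtrOp = MemR->getAddr()->getUnderlyingValue();
const InstructionCost StridedLoadStoreCost = Ctx.TTI.getStridedMemoryOpCost(
    Ingredient.getOpcode(), DataTy, PtrOp, MemR->isMasked(), Alignment,
    Ctx.CostKind, &Ingredient);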
8875c32
@@ -1743,18 +1746,20 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,

/// A recipe to compute the pointers for widened memory accesses of IndexTy.
Would be good to document the arguments
// The stride of consecutive reverse access must be -1.
int64_t Stride = -1;
auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
Is stripping the pointer casts needed here? If it is just needed for the flags, can we get the wrap flags from the recipe?
We can't directly pass the GEPNoWrapFlags from VPVectorEndPointerRecipe here. Tail folding may cause VPVectorEndPointerRecipe to compute addresses that the original scalar loop wouldn't, which results in the removal of the 'inbounds' flag.
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
GEPNoWrapFlags Flags =
(CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
? GEPNoWrapFlags::none()
: GEPNoWrapFlags::inBounds();
VectorPtr = new VPVectorEndPointerRecipe(
Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
However, this doesn't apply to VPVectorPointerRecipe — if the original IR had the 'inbounds' flag, it should be preserved.
auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
// TODO: support strided store
// TODO: support strided accesses with stride not equal to -1
if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || !MemR->isReverse())
Do we need to check if MemR is consecutive here?
I think we don't have to check if MemR is consecutive. There is an assertion in the constructor of VPWidenMemoryRecipe:
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
So I added an assertion for consecutive.
72c17a1
auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr());
VPValue *Ptr = VecEndPtr->getPtr();
Value *PtrUV = Ptr->getUnderlyingValue();
Ah I see, the Ptr argument isn't used by the RISCV TTI, but is potentially used by others; it appears passing nullptr here would work, but then it may cause issues with TTI implementations that use the default from BasicTTI?
Yes, RISC-V TTI can omit passing the pointer because it implements getStridedMemoryOpCost and does not require full pointer information, so passing nullptr could work. However, this might cause issues for targets that do not implement getStridedMemoryOpCost.
Also cherry-picked the branch Mel-Chen:legalizeAndOptimizeInductions. However, it still does not work as well as collectLoopUniforms if the use-chain is too complicated. :(
Still relies on CM_Strided to know legality and cost.
We should reopen it after supporting constant strided accesses.
ping
Do you know if/how this is related/interacts with #130032?
Sorry, I'll add a description.
Initial patch to support strided memory accesses. Currently, only stride -1 is supported, with future extensions planned for additional strides and strided stores.
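For illustration, the kind of source loop this targets; this is a generic example in the spirit of the test case above, not the exact loop from the patch:
// A reverse loop: both accesses are consecutive with stride -1 in memory.
// On targets where it is profitable, the vectorizer can now emit
// llvm.experimental.vp.strided.load/store with a byte stride of
// -sizeof(element) instead of a wide load/store plus a reverse shuffle.
void reverse_add(int *a, const int *b, int n) {
  for (int i = n - 1; i >= 0; --i)
    a[i] = b[i] + 1;
}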