[LV][VPlan] When the load/store stride is -1, use vle/vse instead of vlse/vsse #130032
Conversation
@llvm/pr-subscribers-vectorizers
Author: LiqinWeng (LiqinWeng)
Changes
Full diff: https://github.com/llvm/llvm-project/pull/130032.diff
2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c447fa4843591..0b155c95f0221 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7487,6 +7487,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
// comparing against the legacy cost isn't desirable.
if (isa<VPPartialReductionRecipe>(&R))
return true;
+
+ // The VPlan-based cost model may calculate the cost of strided load/store
+ // which can't be modeled in the legacy cost model.
+ if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R))
+ if (cast<VPWidenMemoryRecipe>(&R)->isReverse())
+ return true;
+
if (Instruction *UI = GetInstructionForCost(&R))
SeenInstrs.insert(UI);
}
@@ -8344,7 +8351,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
- if (Reverse) {
+ if (Reverse && !CM.foldTailWithEVL()) {
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e9f50e88867b2..2fd2cbffec1c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2640,17 +2640,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
- Value *EVL, const Twine &Name) {
- VectorType *ValTy = cast<VectorType>(Operand->getType());
- Value *AllTrueMask =
- Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
- return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
- {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
auto *LI = cast<LoadInst>(&Ingredient);
@@ -2665,19 +2654,26 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Addr = State.get(getAddr(), !CreateGather);
Value *Mask = nullptr;
- if (VPValue *VPMask = getMask()) {
+ if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
- } else {
+ else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
- }
if (CreateGather) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else {
+ if (isReverse()) {
+ auto *EltTy = DataTy->getElementType();
+ // if (EltTy->getScalarSizeInBits() !=
+ // EVL->getType()->getScalarSizeInBits())
+ // EVL = ConstantInt::getSigned(EVL->getType(),
+ // static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
+ auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+ Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
+ Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP->isInBounds());
+ }
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
@@ -2686,10 +2682,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
State.addMetadata(NewLI, LI);
- Instruction *Res = NewLI;
- if (isReverse())
- Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
- State.set(this, Res);
+ State.set(this, NewLI);
}
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
@@ -2707,14 +2700,13 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
- Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
- return Cost;
+ return Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
+ AS, Ctx.CostKind);
- return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, Ctx.CostKind,
- 0);
+ return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
+ getAddr()->getUnderlyingValue(), false,
+ Alignment, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2775,7 +2767,8 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
-
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
VPValue *StoredValue = getStoredValue();
bool CreateScatter = !isConsecutive();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
@@ -2786,22 +2779,32 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
- if (isReverse())
- StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
- if (VPValue *VPMask = getMask()) {
+ if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
- } else {
+ else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
- }
+
Value *Addr = State.get(getAddr(), !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
+ if (isReverse()) {
+ auto *EltTy = DataTy->getElementType();
+ // FIXME: we may need not deal with the size, the InstCombine will deal
+ // with the Offset Type if (EltTy->getScalarSizeInBits() !=
+ // EVL->getType()->getScalarSizeInBits())
+ // EVL = ConstantInt::getSigned(EVL->getType(),
+ // static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
+ auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+ // Value *Offset =
+ // Builder.CreateSub(State.Builder.getIntN(EVL->getType()->getScalarSizeInBits(),
+ // 1), EVL);
+ Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
+ Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP->isInBounds());
+ }
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
@@ -2828,14 +2831,13 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
- Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
- return Cost;
+ return Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
+ AS, Ctx.CostKind);
- return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, Ctx.CostKind,
- 0);
+ return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
+ getAddr()->getUnderlyingValue(), false,
+ Alignment, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
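For readers skimming the diff, here is a minimal sketch of the core transformation, using the same IRBuilder calls that appear above; the helper name and its parameters are illustrative, not part of the patch. For a reverse, EVL-predicated access, the pointer is rebased by (1 - EVL) elements so the memory operation itself becomes unit-stride, which on RISC-V can then lower to vle/vse rather than a strided vlse/vsse or an extra vector reverse.

```cpp
// Sketch only: rebase the pointer of a reverse, EVL-predicated access so a
// plain unit-stride vp.load/vp.store covers the same elements. Mirrors the
// Builder.CreateSub / Builder.CreateGEP calls in the diff above.
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static Value *rebaseReverseAddr(IRBuilderBase &Builder, Value *Addr,
                                Type *EltTy, Value *EVL, bool InBounds) {
  // Offset = 1 - EVL. The reverse access starts at Addr and walks downwards,
  // so the lowest address it touches is Addr + (1 - EVL) elements.
  Value *Offset = Builder.CreateSub(Builder.getInt32(1), EVL);
  // A consecutive access of EVL elements from the rebased pointer then spans
  // exactly the same memory range.
  return Builder.CreateGEP(EltTy, Addr, Offset, "reverse.base", InBounds);
}
```

On the cost-model side, the reverse case is then priced with TTI::getStridedMemoryOpCost instead of a masked memory operation plus an SK_Reverse shuffle.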
Is this related to, or does it overlap with, @Mel-Chen's https://github.com/llvm/llvm-project/pull/128718/files?
It would be helpful to have a more detailed patch description instead of a screenshot.
No. My idea is that when the stride is -1, I set the offset to (1 - EVL), so that the final lowering to assembly uses vle/vse instead of vlse/vsse. But I'm not sure whether this causes any problems; do you have any ideas?
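As a plain arithmetic check of the (1 - EVL) offset (a standalone C++ snippet, not taken from the patch; the array size and values are made up): if an iteration processes EVL elements in reverse starting from index n, the lowest index it touches is n + (1 - EVL).

```cpp
#include <cstdio>

int main() {
  const int n = 7;   // base index of the reverse access in this iteration
  const int EVL = 4; // elements processed in this iteration

  // The reverse access touches a[n], a[n-1], ..., a[n-EVL+1]; the lowest
  // index among them is n + (1 - EVL).
  const int lowest = n + (1 - EVL);
  std::printf("reverse access covers a[%d] .. a[%d]\n", lowest, n);
  return 0;
}
```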
Refer to #123608.
I don’t think this works as intended. Have you verified the correctness of the TSVC results? Take a reverse load as an example. Suppose VF = 4 and we want to load starting from index n. The expected index vector is [n, n-1, n-2, n-3]. However, if EVL = get.vector.length(TripCount) = 4, this patch computes the starting index as n-3 and then performs a consecutive load of 4 elements, i.e. it loads [n-3, n-2, n-1, n]. This is not the same as the original reverse load pattern, which requires [n, n-1, n-2, n-3].
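To make the lane-order point concrete, here is a small standalone C++ illustration (values invented for the example): the rebased consecutive load reads the same four elements as the reverse load, but delivers them in the opposite lane order, so any consumer that depends on lane order would still need a reverse.

```cpp
#include <cstdio>

int main() {
  int a[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  const int n = 7, EVL = 4;

  for (int lane = 0; lane < EVL; ++lane) {
    int reverse_elt = a[n - lane];               // original reverse pattern: a[7], a[6], a[5], a[4]
    int consecutive_elt = a[n - EVL + 1 + lane]; // rebased unit-stride load:  a[4], a[5], a[6], a[7]
    std::printf("lane %d: reverse=%d consecutive=%d\n", lane, reverse_elt,
                consecutive_elt);
  }
  return 0;
}
```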
The number of elements processed in each iteration of the loop is EVL. I compute the offset of the current load/store as (1 - EVL); this turns the reversed access into a forward, consecutive one.

The TSVC test results are as follows: