Commit 6f5f6c2

[LV][VPlan] When the load/store stride is -1, use vle/vse instead of vlse/vsse
1 parent a2ad656 commit 6f5f6c2
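
In short: for a consecutive access with stride -1 under EVL tail folding, the vectorizer now offsets the base pointer by (1 - EVL) elements and emits a plain unit-stride vp.load/vp.store (so the access can lower to vle/vse rather than a stride -1 vlse/vsse), instead of reversing the data and mask with llvm.experimental.vp.reverse. A minimal before/after sketch, adapted and simplified from the updated RISC-V tests below (the %evl and %endptr names are illustrative, not taken from the tests):

  ; Before: unit-stride load at the adjusted address, then an explicit reverse.
  %evl.zext = zext i32 %evl to i64
  %off.old = sub i64 1, %evl.zext
  %low.old = getelementptr i32, ptr %endptr, i64 %off.old
  %wide.old = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %low.old, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %wide.old, <vscale x 4 x i1> splat (i1 true), i32 %evl)

  ; After: the same unit-stride load, with no vp.reverse on the data or mask.
  %off.new = sub i32 1, %evl
  %low.new = getelementptr i32, ptr %endptr, i32 %off.new
  %wide.new = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %low.new, <vscale x 4 x i1> splat (i1 true), i32 %evl)

In the copy-style tests below, the load-side and store-side reverses cancelled each other, which is why both can be dropped together.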

4 files changed: 75 additions & 100 deletions
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 1 deletion
@@ -7061,6 +7061,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                      RepR->getUnderlyingInstr(), VF))
        return true;
    }
+
+    // The VPlan-based cost model may calculate the cost of strided load/store
+    // which can't be modeled in the legacy cost model.
+    if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R))
+      if (cast<VPWidenMemoryRecipe>(&R)->isReverse())
+        return true;
+
    if (Instruction *UI = GetInstructionForCost(&R)) {
      // If we adjusted the predicate of the recipe, the cost in the legacy
      // cost model may be different.
@@ -7758,7 +7765,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   auto *GEP = dyn_cast<GetElementPtrInst>(
       Ptr->getUnderlyingValue()->stripPointerCasts());
   VPSingleDefRecipe *VectorPtr;
-  if (Reverse) {
+  if (Reverse && !CM.foldTailWithEVL()) {
     // When folding the tail, we may compute an address that we don't in the
     // original scalar loop and it may not be inbounds. Drop Inbounds in that
     // case.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 36 additions & 42 deletions
@@ -2918,17 +2918,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
-                                     Value *EVL, const Twine &Name) {
-  VectorType *ValTy = cast<VectorType>(Operand->getType());
-  Value *AllTrueMask =
-      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
-  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
-                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
@@ -2940,29 +2929,33 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Value *EVL = State.get(getEVL(), VPLane(0));
   Value *Addr = State.get(getAddr(), !CreateGather);
   Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
+  if (VPValue *VPMask = getMask())
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
+  else
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
 
   if (CreateGather) {
     NewLI =
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                 nullptr, "wide.masked.gather");
   } else {
+    if (isReverse()) {
+      auto *EltTy = DataTy->getElementType();
+      // if (EltTy->getScalarSizeInBits() !=
+      //     EVL->getType()->getScalarSizeInBits())
+      //   EVL = ConstantInt::getSigned(EVL->getType(),
+      //       static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
+      auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+      Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
+      Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP->isInBounds());
+    }
     NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
                                     {Addr, Mask, EVL}, nullptr, "vp.op.load");
   }
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
   applyMetadata(*NewLI);
-  Instruction *Res = NewLI;
-  if (isReverse())
-    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
-  State.set(this, Res);
+  State.set(this, NewLI);
 }
 
 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
@@ -2980,14 +2973,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS =
       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
-      Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
-  if (!Reverse)
-    return Cost;
-
-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Ctx.TTI.getMaskedMemoryOpCost(Instruction::Load, Ty, Alignment, AS,
+                                       Ctx.CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3044,6 +3031,8 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
   VPValue *StoredValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
@@ -3053,22 +3042,32 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   CallInst *NewSI = nullptr;
   Value *StoredVal = State.get(StoredValue);
   Value *EVL = State.get(getEVL(), VPLane(0));
-  if (isReverse())
-    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
   Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
+  if (VPValue *VPMask = getMask())
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
+  else
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
+
   Value *Addr = State.get(getAddr(), !CreateScatter);
   if (CreateScatter) {
     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                     Intrinsic::vp_scatter,
                                     {StoredVal, Addr, Mask, EVL});
   } else {
+    if (isReverse()) {
+      auto *EltTy = DataTy->getElementType();
+      // FIXME: we may need not deal with the size, the InstCombine will deal
+      // with the Offset Type if (EltTy->getScalarSizeInBits() !=
+      //     EVL->getType()->getScalarSizeInBits())
+      //   EVL = ConstantInt::getSigned(EVL->getType(),
+      //       static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
+      auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+      // Value *Offset =
+      //     Builder.CreateSub(State.Builder.getIntN(EVL->getType()->getScalarSizeInBits(),
+      //                       1), EVL);
+      Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
+      Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP->isInBounds());
+    }
     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                     Intrinsic::vp_store,
                                     {StoredVal, Addr, Mask, EVL});
@@ -3093,14 +3092,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS =
       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
-      Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
-  if (!Reverse)
-    return Cost;
 
-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Ctx.TTI.getMaskedMemoryOpCost(Instruction::Store, Ty, Alignment, AS,
+                                       Ctx.CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
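
A worked example of the (1 - EVL) pointer adjustment that both execute() overloads above now emit, assuming i32 elements, EVL = 4, and a base pointer %p that points at the lane-0 (highest-address) element of the reverse access; the names are illustrative only:

  ; %off = 1 - 4 = -3, so %lo sits three i32 elements below %p, and a
  ; unit-stride vp.load/vp.store at %lo touches %p[-3] .. %p[0] -- the same
  ; four elements a stride -1 access starting at %p would touch.
  %off = sub i32 1, %evl
  %lo = getelementptr i32, ptr %p, i32 %off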

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll

Lines changed: 27 additions & 50 deletions
@@ -31,21 +31,15 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP9]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT:    [[TMP16:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 [[TMP16]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -134,23 +128,15 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT:    [[TMP19:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 [[TMP19]]
+; IF-EVL-NEXT:    [[VP_OP_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP14]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP27]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT:    [[TMP22:%.*]] = sub i32 1, [[TMP5]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP15]], i32 [[TMP22]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD2]], ptr align 4 [[TMP17]], <vscale x 4 x i1> [[TMP14]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -259,31 +245,22 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
 ; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 0, [[TMP9]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
-; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 [[TMP9]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_OP_LOAD]]
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 0, [[TMP16]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP16]]
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
+; IF-EVL-NEXT:    [[TMP19:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP12]], i32 [[TMP19]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP22]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 1, [[TMP22]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT:    [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT:    [[TMP17:%.*]] = sub i32 1, [[TMP6]]
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP16]], i32 [[TMP17]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP18]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll

Lines changed: 4 additions & 7 deletions
@@ -36,13 +36,10 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 0, [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sub i64 1, [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
-; CHECK-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 1, [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[TMP13]], i32 [[TMP14]]
+; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]

0 commit comments
