[LV][VPlan] When the load/store stride is -1, use vle/vse instead of vlse/vsse #130032

Open · wants to merge 2 commits into main
9 changes: 8 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7062,6 +7062,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
RepR->getUnderlyingInstr(), VF))
return true;
}

// The VPlan-based cost model may calculate the cost of a strided load/store,
// which cannot be modeled in the legacy cost model.
if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R))
if (cast<VPWidenMemoryRecipe>(&R)->isReverse())
return true;

if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
@@ -7761,7 +7768,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
if (Reverse) {
if (Reverse && !CM.foldTailWithEVL()) {
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
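
Note on the second hunk: with EVL tail folding, the reverse address is now computed inside the widened EVL load/store recipes (see the VPlanRecipes.cpp changes below), so the generic reverse vector-pointer sequence is skipped here. A minimal before/after sketch of the emitted addressing, reconstructed from the test updates further down (value names and the i32 element type are illustrative):

; Before this patch: reverse vector pointer plus an explicit vp.reverse of the data.
%evl.zext = zext i32 %evl to i64
%off0 = mul i64 0, %evl.zext
%off1 = sub i64 1, %evl.zext
%gep0 = getelementptr i32, ptr %base, i64 %off0
%gep1 = getelementptr i32, ptr %gep0, i64 %off1
%wide.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %gep1, <vscale x 4 x i1> splat (i1 true), i32 %evl)
%rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)

; With this patch: a single pointer rebased by (1 - EVL) elements, and no vp.reverse.
%off = sub i32 1, %evl
%gep = getelementptr inbounds i32, ptr %base, i32 %off
%vp.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
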
77 changes: 36 additions & 41 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2921,17 +2921,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif

/// Use all-true mask for reverse rather than actual mask, as it avoids a
/// dependence w/o affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
Value *EVL, const Twine &Name) {
VectorType *ValTy = cast<VectorType>(Operand->getType());
Value *AllTrueMask =
Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
{Operand, AllTrueMask, EVL}, nullptr, Name);
}

void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
@@ -2943,29 +2932,33 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Addr = State.get(getAddr(), !CreateGather);
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
if (isReverse())
Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}

if (CreateGather) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else {
if (isReverse()) {
auto *EltTy = DataTy->getElementType();
// if (EltTy->getScalarSizeInBits() !=
// EVL->getType()->getScalarSizeInBits())
// EVL = ConstantInt::getSigned(EVL->getType(),
// static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP && GEP->isInBounds());
}
NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
{Addr, Mask, EVL}, nullptr, "vp.op.load");
}
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
Instruction *Res = NewLI;
if (isReverse())
Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
State.set(this, NewLI);
}
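
The rebasing above follows from the pointer arithmetic of a stride -1 access: EVL elements starting at element pointer %p occupy %p[1 - EVL] .. %p[0], so the lowest address is %p plus (1 - EVL) elements, and a single contiguous vp.load from there covers the same memory. A small illustrative IR fragment (the names, element type, and the concrete EVL in the comment are assumptions for the example):

; With %evl = 4 the offset is 1 - 4 = -3, so the load reads
; %p[-3], %p[-2], %p[-1], %p[0] into lanes 0..3 (no reverse is applied).
%off = sub i32 1, %evl
%base = getelementptr inbounds i32, ptr %p, i32 %off
%v = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %base, <vscale x 4 x i1> splat (i1 true), i32 %evl)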

InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
Expand All @@ -2983,14 +2976,9 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;

return Cost + Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
return Ctx.TTI.getMaskedMemoryOpCost(Instruction::Load, Ty, Alignment, AS,
Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3047,6 +3035,8 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
#endif

void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
VPValue *StoredValue = getStoredValue();
bool CreateScatter = !isConsecutive();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
@@ -3056,22 +3046,32 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
if (isReverse())
StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
if (isReverse())
Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}

Value *Addr = State.get(getAddr(), !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
if (isReverse()) {
auto *EltTy = DataTy->getElementType();
// FIXME: We may not need to handle the element size here; InstCombine will
// adjust the offset type.
// if (EltTy->getScalarSizeInBits() !=
// EVL->getType()->getScalarSizeInBits())
// EVL = ConstantInt::getSigned(EVL->getType(),
// static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
// Value *Offset =
// Builder.CreateSub(State.Builder.getIntN(EVL->getType()->getScalarSizeInBits(),
// 1), EVL);
Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP && GEP->isInBounds());
}
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_store,
{StoredVal, Addr, Mask, EVL});
@@ -3096,14 +3096,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;

return Cost + Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
return Ctx.TTI.getMaskedMemoryOpCost(Instruction::Store, Ty, Alignment, AS,
Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -31,21 +31,15 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i32 1, [[TMP5]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP9]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
; IF-EVL-NEXT: [[TMP16:%.*]] = sub i32 1, [[TMP5]]
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 [[TMP16]]
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
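
The masked variant in the next hunk shows the same rebasing when a mask is present: the compare result now feeds the mask operand of vp.load/vp.store directly, and no @llvm.experimental.vp.reverse is emitted for either the mask or the data. A distilled sketch of that hunk's output (value names are illustrative):

%cmp = icmp slt <vscale x 4 x i32> %cond, splat (i32 100)
%off = sub i32 1, %evl
%src = getelementptr i32, ptr %src.base, i32 %off
%val = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %src, <vscale x 4 x i1> %cmp, i32 %evl)
%dst = getelementptr i32, ptr %dst.base, i32 %off
call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val, ptr align 4 %dst, <vscale x 4 x i1> %cmp, i32 %evl)
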
@@ -134,23 +128,15 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP5]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 [[TMP19]]
; IF-EVL-NEXT: [[VP_OP_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP14]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
; IF-EVL-NEXT: [[TMP22:%.*]] = sub i32 1, [[TMP5]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP15]], i32 [[TMP22]]
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD2]], ptr align 4 [[TMP17]], <vscale x 4 x i1> [[TMP14]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -259,31 +245,22 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP9]]
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i32 1, [[TMP6]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 [[TMP9]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_OP_LOAD]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]]
; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP6]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP12]], i32 [[TMP19]]
; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]]
; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 1, [[TMP6]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP16]], i32 [[TMP17]]
; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP18]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -32,13 +32,10 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP12:%.*]] = sub nuw nsw i64 1, [[OFFSET_IDX]]
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]]
; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[ARRAYIDX13]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = sub i32 1, [[TMP11]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[TMP13]], i32 [[TMP14]]
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]