
[SLP]Add runtime stride support for strided loads. #81517

Merged
166 changes: 161 additions & 5 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -88,6 +88,7 @@
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -4014,6 +4015,126 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
});
}

/// Checks if the provided list of pointers \p PointerOps represents strided
/// pointers for the element type \p ElemTy. If it does not, std::nullopt is
/// returned. Otherwise, if \p Inst is not specified, an engaged optional
/// holding nullptr is returned to show that the pointers are strided. If
/// \p Inst is specified, the runtime stride is materialized before the given
/// \p Inst.
/// \returns std::nullopt if the pointers do not have a runtime stride;
/// nullptr or the actual stride value otherwise.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const DataLayout &DL, ScalarEvolution &SE,
SmallVectorImpl<unsigned> &SortedIndices,
Instruction *Inst = nullptr) {
SmallVector<const SCEV *> SCEVs;
const SCEV *PtrSCEVLowest = nullptr;
const SCEV *PtrSCEVHighest = nullptr;
// Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
// addresses).
for (Value *Ptr : PointerOps) {
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
if (!PtrSCEV)
return std::nullopt;
SCEVs.push_back(PtrSCEV);
if (!PtrSCEVLowest && !PtrSCEVHighest) {
PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
continue;
}
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Diff))
return std::nullopt;
if (Diff->isNonConstantNegative()) {
[Review comment, Collaborator]: Why is isNonConstant part? Why not isKnownNegative?
PtrSCEVLowest = PtrSCEV;
continue;
}
const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
if (isa<SCEVCouldNotCompute>(Diff1))
return std::nullopt;
if (Diff1->isNonConstantNegative()) {
PtrSCEVHighest = PtrSCEV;
continue;
}
}
// Dist = PtrSCEVHighest - PtrSCEVLowest;
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Dist))
return std::nullopt;
int Size = DL.getTypeStoreSize(ElemTy);
auto TryGetStride = [&](const SCEV *Dist,
const SCEV *Multiplier) -> const SCEV * {
if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
if (M->getOperand(0) == Multiplier)
return M->getOperand(1);
if (M->getOperand(1) == Multiplier)
return M->getOperand(0);
return nullptr;
}
if (Multiplier == Dist)
return SE.getConstant(Dist->getType(), 1);
return SE.getUDivExactExpr(Dist, Multiplier);
};
// Stride_in_elements = Dist / (element_size * (num_elems - 1)).
const SCEV *Stride = nullptr;
if (Size != 1 || SCEVs.size() > 2) {
const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
Stride = TryGetStride(Dist, Sz);
if (!Stride)
return std::nullopt;
}
if (!Stride || isa<SCEVConstant>(Stride))
return std::nullopt;
// Iterate through all pointers and check that all distances are
// unique multiples of Stride.
using DistOrdPair = std::pair<int64_t, int>;
auto Compare = llvm::less_first();
std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
int Cnt = 0;
bool IsConsecutive = true;
for (const SCEV *PtrSCEV : SCEVs) {
unsigned Dist = 0;
if (PtrSCEV != PtrSCEVLowest) {
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
const SCEV *Coeff = TryGetStride(Diff, Stride);
if (!Coeff)
return std::nullopt;
const auto *SC = dyn_cast<SCEVConstant>(Coeff);
if (!SC || isa<SCEVCouldNotCompute>(SC))
return std::nullopt;
if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
SE.getMulExpr(Stride, SC)))
->isZero())
return std::nullopt;
Dist = SC->getAPInt().getZExtValue();
}
// Bail out if the distance is not a multiple of the element size or the
// implied lane index falls outside the group.
if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
return std::nullopt;
auto Res = Offsets.emplace(Dist, Cnt);
if (!Res.second)
return std::nullopt;
// Consecutive order if the inserted element is the last one.
IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
++Cnt;
}
if (Offsets.size() != SCEVs.size())
return std::nullopt;
SortedIndices.clear();
if (!IsConsecutive) {
// Fill SortedIndices array only if it is non-consecutive.
SortedIndices.resize(PointerOps.size());
Cnt = 0;
for (const std::pair<int64_t, int> &Pair : Offsets) {
SortedIndices[Cnt] = Pair.second;
++Cnt;
}
}
if (!Inst)
return nullptr;
SCEVExpander Expander(SE, DL, "strided-load-vec");
return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}
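
Once the symbolic stride has been divided out of each pointer's distance from the lowest pointer, only plain integer bookkeeping remains: every per-lane coefficient must be a unique multiple k * ElemSize with k smaller than the group size, and the lane order is recovered by sorting those coefficients. For instance, four float lanes at byte offsets 0, 4s, 8s and 12s give Dist = 12s, and dividing by element_size * (num_elems - 1) = 12 recovers the symbolic stride s. Below is a minimal standalone sketch of that uniqueness/ordering check (plain C++, no SCEV involved; the function and parameter names are illustrative, not part of the patch):

```cpp
#include <cstdint>
#include <optional>
#include <set>
#include <utility>
#include <vector>

// Returns std::nullopt if the coefficients cannot form a strided group;
// otherwise returns the lane order (empty when the group is already
// consecutive), mirroring how calculateRtStride fills SortedIndices.
std::optional<std::vector<int>>
strideGroupOrder(const std::vector<int64_t> &Coeffs, int64_t ElemSize) {
  std::set<std::pair<int64_t, int>> Offsets; // (coefficient, original index)
  bool IsConsecutive = true;
  int Cnt = 0;
  for (int64_t C : Coeffs) {
    // The coefficient must be an exact multiple of the element size and the
    // implied lane index must stay inside the group.
    if ((C / ElemSize) * ElemSize != C ||
        (C / ElemSize) >= static_cast<int64_t>(Coeffs.size()))
      return std::nullopt;
    auto Res = Offsets.emplace(C, Cnt);
    if (!Res.second) // a repeated coefficient means two lanes coincide
      return std::nullopt;
    // Still consecutive only if every insertion so far landed at the end.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  std::vector<int> Order;
  if (!IsConsecutive)
    for (const auto &P : Offsets)
      Order.push_back(P.second);
  return Order;
}
```

For example, coefficients {0, 4, 8, 12} with ElemSize 4 come back as an empty (already consecutive) order, while {12, 8, 4, 0} come back as {3, 2, 1, 0}, which is what lets the reverse-order groups in the tests below still be treated as a single strided load.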

BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
@@ -4046,6 +4167,11 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
return LoadsState::StridedVectorize;
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), *TLI);
})) {
@@ -4465,6 +4591,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
return std::nullopt; // No need to reorder.
return std::move(ResOrder);
}
if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
any_of(TE.UserTreeIndices,
[](const EdgeInfo &EI) {
return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
}) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
if ((TE.State == TreeEntry::Vectorize ||
TE.State == TreeEntry::StridedVectorize) &&
(isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
@@ -4930,7 +5063,8 @@ bool BoUpSLP::canReorderOperands(
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I &&
OpData.second->State == TreeEntry::Vectorize;
(OpData.second->State == TreeEntry::Vectorize ||
OpData.second->State == TreeEntry::StridedVectorize);
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
@@ -4947,6 +5081,7 @@
// If there are reused scalars, process this node as a regular vectorize
// node, just reorder reuses mask.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
continue;
@@ -4955,6 +5090,7 @@
if (count_if(ReorderableGathers,
[&Gather, UserTE, I](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
"Only non-vectorized nodes are expected.");
if (any_of(TE->UserTreeIndices,
[UserTE, I](const EdgeInfo &EI) {
@@ -12032,10 +12168,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
std::optional<int> Diff = getPointersDiff(
VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
Type *StrideTy = DL->getIndexType(PO->getType());
int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
Value *StrideVal =
ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
DL->getTypeAllocSize(ScalarTy));
Value *StrideVal;
if (Diff) {
int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
StrideVal =
ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
DL->getTypeAllocSize(ScalarTy));
} else {
SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
transform(E->Scalars, PointerOps.begin(), [](Value *V) {
return cast<LoadInst>(V)->getPointerOperand();
});
OrdersType Order;
std::optional<Value *> Stride =
calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
&*Builder.GetInsertPoint());
Value *NewStride =
Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
StrideVal = Builder.CreateMul(
NewStride,
ConstantInt::get(
StrideTy,
(IsReverseOrder ? -1 : 1) *
static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
}
Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
auto *Inst = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_load,
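The test updates below show the effect: when the stride is only known at run time, the vectorizer now multiplies the SCEV-expanded stride by the element's alloc size (negated for reverse-order groups) and passes the result to llvm.experimental.vp.strided.load. A minimal sketch of just that byte-stride arithmetic, with illustrative names rather than the patch's IRBuilder calls:

```cpp
#include <cstdint>

// Mirrors the new else-branch in vectorizeTree: the runtime element stride is
// scaled by the element's alloc size and negated when the lanes were matched
// in reverse order (lane 0 at the highest address).
int64_t vpStridedLoadByteStride(int64_t RtElemStride, int64_t ElemAllocSize,
                                bool IsReverseOrder) {
  return RtElemStride * ((IsReverseOrder ? -1 : 1) * ElemAllocSize);
}
```

In test1 the runtime group loads p[0], p[str], ..., p[7*str] in forward order, so the emitted stride is the "mul i64 [[STR]], 4" seen in the CHECK lines; in test2 lane 0 sits at the highest address, so that element becomes the base pointer and the stride becomes "mul i64 [[STR]], -4".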
@@ -78,67 +78,13 @@ define void @test1(ptr %p, ptr noalias %s, i32 %stride) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2
; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STR]], 4
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 [[TMP0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -215,38 +161,12 @@ define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0
; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I3]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I5]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I7]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I9]], i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I11]], i32 5
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I13]], i32 6
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[I15]], i32 7
; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <8 x float> [[TMP8]], [[TMP0]]
; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STR]], -4
; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 [[TMP1]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
;
entry: