Commit 1c2b79a

[SLP]Add runtime stride support for strided loads.
Added support for runtime strides.

Reviewers: preames, RKSimon
Reviewed By: preames
Pull Request: #81517
1 parent f3be842 commit 1c2b79a
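
For context, here is a minimal C++ sketch of the access pattern this change targets (an editor's illustration, not part of the commit; the function and variable names are invented, and the shape mirrors test1 in the RISC-V test file updated below). One operand of each subtraction walks memory with a stride that is only known at run time, which SLP previously had to gather as individual scalar loads.

// Hypothetical kernel: p[i * stride] uses a runtime stride, p[30 - 4 * i] a
// constant one. After this patch SLP can cover each operand with a single
// llvm.experimental.vp.strided.load, materializing the runtime byte stride
// as stride * sizeof(float); the constant-stride operand already worked.
void diff_strided(float *p, float *s, long stride) {
  // s must not alias p (the IR tests mark %s as noalias).
  for (long i = 0; i < 8; ++i) // the tests are the fully unrolled form
    s[i] = p[30 - 4 * i] - p[i * stride];
}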

2 files changed: +170 −94 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 161 additions & 5 deletions
@@ -88,6 +88,7 @@
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -4014,6 +4015,126 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
   });
 }
 
+/// Checks if the provided list of pointers \p Pointers represents the strided
+/// pointers for type ElemTy. If they are not, std::nullopt is returned.
+/// Otherwise, if \p Inst is not specified, just initialized optional value is
+/// returned to show that the pointers represent strided pointers. If \p Inst
+/// specified, the runtime stride is materialized before the given \p Inst.
+/// \returns std::nullopt if the pointers are not pointers with the runtime
+/// stride, nullptr or actual stride value, otherwise.
+static std::optional<Value *>
+calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+                  const DataLayout &DL, ScalarEvolution &SE,
+                  SmallVectorImpl<unsigned> &SortedIndices,
+                  Instruction *Inst = nullptr) {
+  SmallVector<const SCEV *> SCEVs;
+  const SCEV *PtrSCEVLowest = nullptr;
+  const SCEV *PtrSCEVHighest = nullptr;
+  // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
+  // addresses).
+  for (Value *Ptr : PointerOps) {
+    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+    if (!PtrSCEV)
+      return std::nullopt;
+    SCEVs.push_back(PtrSCEV);
+    if (!PtrSCEVLowest && !PtrSCEVHighest) {
+      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
+      continue;
+    }
+    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
+    if (isa<SCEVCouldNotCompute>(Diff))
+      return std::nullopt;
+    if (Diff->isNonConstantNegative()) {
+      PtrSCEVLowest = PtrSCEV;
+      continue;
+    }
+    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
+    if (isa<SCEVCouldNotCompute>(Diff1))
+      return std::nullopt;
+    if (Diff1->isNonConstantNegative()) {
+      PtrSCEVHighest = PtrSCEV;
+      continue;
+    }
+  }
+  // Dist = PtrSCEVHighest - PtrSCEVLowest;
+  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
+  if (isa<SCEVCouldNotCompute>(Dist))
+    return std::nullopt;
+  int Size = DL.getTypeStoreSize(ElemTy);
+  auto TryGetStride = [&](const SCEV *Dist,
+                          const SCEV *Multiplier) -> const SCEV * {
+    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
+      if (M->getOperand(0) == Multiplier)
+        return M->getOperand(1);
+      if (M->getOperand(1) == Multiplier)
+        return M->getOperand(0);
+      return nullptr;
+    }
+    if (Multiplier == Dist)
+      return SE.getConstant(Dist->getType(), 1);
+    return SE.getUDivExactExpr(Dist, Multiplier);
+  };
+  // Stride_in_elements = Dist / element_size * (num_elems - 1).
+  const SCEV *Stride = nullptr;
+  if (Size != 1 || SCEVs.size() > 2) {
+    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
+    Stride = TryGetStride(Dist, Sz);
+    if (!Stride)
+      return std::nullopt;
+  }
+  if (!Stride || isa<SCEVConstant>(Stride))
+    return std::nullopt;
+  // Iterate through all pointers and check if all distances are
+  // unique multiple of Stride.
+  using DistOrdPair = std::pair<int64_t, int>;
+  auto Compare = llvm::less_first();
+  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
+  int Cnt = 0;
+  bool IsConsecutive = true;
+  for (const SCEV *PtrSCEV : SCEVs) {
+    unsigned Dist = 0;
+    if (PtrSCEV != PtrSCEVLowest) {
+      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
+      const SCEV *Coeff = TryGetStride(Diff, Stride);
+      if (!Coeff)
+        return std::nullopt;
+      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
+      if (!SC || isa<SCEVCouldNotCompute>(SC))
+        return std::nullopt;
+      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
+                                                  SE.getMulExpr(Stride, SC)))
+               ->isZero())
+        return std::nullopt;
+      Dist = SC->getAPInt().getZExtValue();
+    }
+    // If the strides are not the same or repeated, we can't vectorize.
+    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
+      return std::nullopt;
+    auto Res = Offsets.emplace(Dist, Cnt);
+    if (!Res.second)
+      return std::nullopt;
+    // Consecutive order if the inserted element is the last one.
+    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
+    ++Cnt;
+  }
+  if (Offsets.size() != SCEVs.size())
+    return std::nullopt;
+  SortedIndices.clear();
+  if (!IsConsecutive) {
+    // Fill SortedIndices array only if it is non-consecutive.
+    SortedIndices.resize(PointerOps.size());
+    Cnt = 0;
+    for (const std::pair<int64_t, int> &Pair : Offsets) {
+      SortedIndices[Cnt] = Pair.second;
+      ++Cnt;
+    }
+  }
+  if (!Inst)
+    return nullptr;
+  SCEVExpander Expander(SE, DL, "strided-load-vec");
+  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+}
+
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
     SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
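
To make the arithmetic behind calculateRtStride concrete, here is a plain-integer analogue (an editor's sketch, not code from the patch; the pass performs the same factoring symbolically on SCEVs, so the recovered stride can be a runtime value, and it only takes this path when the stride is not a compile-time constant). The distance between the lowest and highest pointer must factor as stride * elt_size * (n - 1), and every pointer must then sit at a distinct multiple k * stride * elt_size with 0 <= k < n; the order of the k values is what fills SortedIndices.

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

// Constant-value analogue of the SCEV-based check. Offsets are byte distances
// of the pointers from the lowest one, EltSize is the element size in bytes.
// Returns the stride in elements if the offsets form a (possibly permuted)
// strided slice, std::nullopt otherwise.
std::optional<int64_t> recoverStride(const std::vector<int64_t> &Offsets,
                                     int64_t EltSize) {
  if (Offsets.size() < 2)
    return std::nullopt;
  int64_t Dist = *std::max_element(Offsets.begin(), Offsets.end());
  // Dist must be Stride * EltSize * (NumElems - 1); factor it out.
  int64_t Mul = EltSize * static_cast<int64_t>(Offsets.size() - 1);
  if (Mul == 0 || Dist % Mul != 0)
    return std::nullopt;
  int64_t Stride = Dist / Mul;
  if (Stride == 0)
    return std::nullopt; // all pointers equal; not a strided access
  // Every offset must be a unique k * Stride * EltSize with 0 <= k < NumElems.
  std::vector<bool> Seen(Offsets.size(), false);
  for (int64_t Off : Offsets) {
    if (Off % (Stride * EltSize) != 0)
      return std::nullopt;
    int64_t K = Off / (Stride * EltSize);
    if (K >= static_cast<int64_t>(Offsets.size()) || Seen[K])
      return std::nullopt;
    Seen[K] = true;
  }
  return Stride; // multiply by EltSize for the byte stride of the intrinsic
}

For example, offsets {0, 20, 40, 60} with EltSize 4 recover a stride of 5 elements, while {0, 20, 20, 60} are rejected because lane offset 20 repeats.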
@@ -4046,6 +4167,11 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
+  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+  if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
+      TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
+      calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
+    return LoadsState::StridedVectorize;
  if (IsSorted || all_of(PointerOps, [&](Value *P) {
        return arePointersCompatible(P, PointerOps.front(), *TLI);
      })) {
@@ -4465,6 +4591,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       return std::nullopt; // No need to reorder.
     return std::move(ResOrder);
   }
+  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
+      any_of(TE.UserTreeIndices,
+             [](const EdgeInfo &EI) {
+               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
+             }) &&
+      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
+    return std::nullopt;
   if ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
@@ -4930,7 +5063,8 @@ bool BoUpSLP::canReorderOperands(
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
           return OpData.first == I &&
-                 OpData.second->State == TreeEntry::Vectorize;
+                 (OpData.second->State == TreeEntry::Vectorize ||
+                  OpData.second->State == TreeEntry::StridedVectorize);
         }))
       continue;
     if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
@@ -4947,6 +5081,7 @@ bool BoUpSLP::canReorderOperands(
       // If there are reused scalars, process this node as a regular vectorize
      // node, just reorder reuses mask.
      if (TE->State != TreeEntry::Vectorize &&
+          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
@@ -4955,6 +5090,7 @@ bool BoUpSLP::canReorderOperands(
     if (count_if(ReorderableGathers,
                  [&Gather, UserTE, I](TreeEntry *TE) {
                    assert(TE->State != TreeEntry::Vectorize &&
+                          TE->State != TreeEntry::StridedVectorize &&
                           "Only non-vectorized nodes are expected.");
                    if (any_of(TE->UserTreeIndices,
                               [UserTE, I](const EdgeInfo &EI) {
@@ -12032,10 +12168,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       std::optional<int> Diff = getPointersDiff(
           VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
       Type *StrideTy = DL->getIndexType(PO->getType());
-      int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
-      Value *StrideVal =
-          ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
-                                         DL->getTypeAllocSize(ScalarTy));
+      Value *StrideVal;
+      if (Diff) {
+        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
+        StrideVal =
+            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
+                                           DL->getTypeAllocSize(ScalarTy));
+      } else {
+        SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
+        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
+          return cast<LoadInst>(V)->getPointerOperand();
+        });
+        OrdersType Order;
+        std::optional<Value *> Stride =
+            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
+                              &*Builder.GetInsertPoint());
+        Value *NewStride =
+            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
+        StrideVal = Builder.CreateMul(
+            NewStride,
+            ConstantInt::get(
+                StrideTy,
+                (IsReverseOrder ? -1 : 1) *
+                    static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
+      }
       Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
       auto *Inst = Builder.CreateIntrinsic(
           Intrinsic::experimental_vp_strided_load,
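
A short note on the codegen above (an editor's sketch of the arithmetic, using names from the diff; the helper below is illustrative, not an LLVM API): when getPointersDiff cannot produce a constant distance, the byte stride operand of llvm.experimental.vp.strided.load is built at run time as the recovered element stride, cast to the index type and multiplied by the element's alloc size, negated when the lanes are emitted in reverse order. This matches the updated tests below: mul i64 %str, 4 for the forward runtime-stride operand in test1 and mul i64 %str, -4 for the reversed one in test2.

// Illustrative only: the value the IRBuilder sequence above computes.
// recoveredStride comes from calculateRtStride, allocSize from
// DL->getTypeAllocSize(ScalarTy).
long byteStride(long recoveredStride, long allocSize, bool isReverseOrder) {
  return recoveredStride * (isReverseOrder ? -1 : 1) * allocSize;
}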

llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll

Lines changed: 9 additions & 89 deletions
@@ -78,67 +78,13 @@ define void @test1(ptr %p, ptr noalias %s, i32 %stride) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
-; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
-; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
-; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
-; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
-; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
-; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
-; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
-; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
-; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
-; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
-; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
-; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
-; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
-; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
-; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
-; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
-; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
-; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
-; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
-; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2
-; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
-; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STR]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 [[TMP0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -215,38 +161,12 @@ define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
 ; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
-; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
-; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
-; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
-; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
-; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
-; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
-; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0
-; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
 ; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I3]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I5]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I7]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I9]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I11]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I13]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[I15]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <8 x float> [[TMP8]], [[TMP0]]
-; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STR]], -4
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 [[TMP1]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
