Commit fcad8d3
[SLP] Make SLPVectorizer to use llvm.masked.gather intrinsic

For the scattered operands of load instructions it makes sense to use a gathering load intrinsic, which can lower to a native instruction on X86/AVX512 and ARM/SVE. This also enables building a vectorization tree with entries containing scattered operands. The next step is to add scattered stores.

Fixes PR47629 and PR47623

Differential Revision: https://reviews.llvm.org/D90445

1 parent b90228e commit fcad8d3
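To illustrate the effect, here is a minimal IR sketch (the functions @scattered_loads and @gathered_loads are made up for this example; the pattern is modeled on the pr47623.ll checks updated below): two loads from non-consecutive elements of @b, which previously had to stay scalar, can now be represented by a single llvm.masked.gather call with an all-true mask.

@b = global [8 x i32] zeroinitializer, align 16

; Scalar form: the loads touch elements 0 and 2 of @b, which are not
; consecutive, so a plain vector load cannot be used.
define <2 x i32> @scattered_loads() {
  %x = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
  %y = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
  %v0 = insertelement <2 x i32> undef, i32 %x, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 %y, i32 1
  ret <2 x i32> %v1
}

; Gathered form the SLP vectorizer can now emit: one intrinsic call,
; all lanes enabled, undef pass-through.
define <2 x i32> @gathered_loads() {
  %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> <i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2)>, i32 16, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32> %v
}

declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32 immarg, <2 x i1>, <2 x i32>)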

File tree: 4 files changed (+205, -87 lines)


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 68 additions & 21 deletions
@@ -1552,8 +1552,10 @@ class BoUpSLP {
     /// The Scalars are vectorized into this value. It is initialized to Null.
     Value *VectorizedValue = nullptr;
 
-    /// Do we need to gather this sequence ?
-    enum EntryState { Vectorize, NeedToGather };
+    /// Do we need to gather this sequence or vectorize it
+    /// (either with vector instruction or with scatter/gather
+    /// intrinsics for store/load)?
+    enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
     EntryState State;
 
     /// Does this sequence require some shuffling?
@@ -1701,6 +1703,9 @@ class BoUpSLP {
       case Vectorize:
         dbgs() << "Vectorize\n";
         break;
+      case ScatterVectorize:
+        dbgs() << "ScatterVectorize\n";
+        break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
@@ -1745,17 +1750,33 @@ class BoUpSLP {
                           const EdgeInfo &UserTreeIdx,
                           ArrayRef<unsigned> ReuseShuffleIndices = None,
                           ArrayRef<unsigned> ReorderIndices = None) {
-    bool Vectorized = (bool)Bundle;
+    TreeEntry::EntryState EntryState =
+        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
+    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
+                        ReuseShuffleIndices, ReorderIndices);
+  }
+
+  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
+                          TreeEntry::EntryState EntryState,
+                          Optional<ScheduleData *> Bundle,
+                          const InstructionsState &S,
+                          const EdgeInfo &UserTreeIdx,
+                          ArrayRef<unsigned> ReuseShuffleIndices = None,
+                          ArrayRef<unsigned> ReorderIndices = None) {
+    assert(!(Bundle && EntryState == TreeEntry::NeedToGather) &&
+           "Need to gather vectorized entry?");
+    assert((Bundle || EntryState == TreeEntry::NeedToGather) &&
+           "Need to vectorize gather entry?");
     VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
     TreeEntry *Last = VectorizableTree.back().get();
     Last->Idx = VectorizableTree.size() - 1;
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
-    Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
+    Last->State = EntryState;
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
     Last->setOperations(S);
-    if (Vectorized) {
+    if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
         assert(!getTreeEntry(V) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
@@ -2841,6 +2862,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         }
         return;
       }
+      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+      TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
+                                   UserTreeIdx, ReuseShuffleIndicies);
+      TE->setOperandsInOrder();
+      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+      LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+      return;
     }
 
     LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -3427,7 +3455,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
     }
     return ReuseShuffleCost + getGatherCost(VL);
   }
-  assert(E->State == TreeEntry::Vectorize && "Unhandled state");
+  assert((E->State == TreeEntry::Vectorize ||
+          E->State == TreeEntry::ScatterVectorize) &&
+         "Unhandled state");
   assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
@@ -3682,9 +3712,16 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecLdCost =
-          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
-                               CostKind, VL0);
+      int VecLdCost;
+      if (E->State == TreeEntry::Vectorize) {
+        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+                                         CostKind, VL0);
+      } else {
+        assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+        VecLdCost = TTI->getGatherScatterOpCost(
+            Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
+            /*VariableMask=*/false, alignment, CostKind, VL0);
+      }
       if (!E->ReorderIndices.empty()) {
         // TODO: Merge this shuffle with the ReuseShuffleCost.
         VecLdCost += TTI->getShuffleCost(
@@ -4276,7 +4313,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     return Vec;
   }
 
-  assert(E->State == TreeEntry::Vectorize && "Unhandled state");
+  assert((E->State == TreeEntry::Vectorize ||
+          E->State == TreeEntry::ScatterVectorize) &&
+         "Unhandled state");
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
   Instruction *VL0 = E->getMainOp();
@@ -4505,20 +4544,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       setInsertPointAfterBundle(E);
 
       LoadInst *LI = cast<LoadInst>(VL0);
+      Instruction *NewLI;
       unsigned AS = LI->getPointerAddressSpace();
+      Value *PO = LI->getPointerOperand();
+      if (E->State == TreeEntry::Vectorize) {
 
-      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
-                                            VecTy->getPointerTo(AS));
+        Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
 
-      // The pointer operand uses an in-tree scalar so we add the new BitCast to
-      // ExternalUses list to make sure that an extract will be generated in the
-      // future.
-      Value *PO = LI->getPointerOperand();
-      if (getTreeEntry(PO))
-        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+        // The pointer operand uses an in-tree scalar so we add the new BitCast
+        // to ExternalUses list to make sure that an extract will be generated
+        // in the future.
+        if (getTreeEntry(PO))
+          ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
+
+        NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
+      } else {
+        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+        Value *VecPtr = vectorizeTree(E->getOperand(0));
+        NewLI = Builder.CreateMaskedGather(VecPtr, LI->getAlign());
+      }
+      Value *V = propagateMetadata(NewLI, E->Scalars);
 
-      LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
-      Value *V = propagateMetadata(LI, E->Scalars);
       if (IsReorder) {
         SmallVector<int, 4> Mask;
         inversePermutation(E->ReorderIndices, Mask);
@@ -4795,7 +4841,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
       continue;
     TreeEntry *E = getTreeEntry(Scalar);
     assert(E && "Invalid scalar");
-    assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list");
+    assert(E->State != TreeEntry::NeedToGather &&
+           "Extracting from a gather list");
 
     Value *Vec = E->VectorizedValue;
     assert(Vec && "Can't find vectorizable value");
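In the ScatterVectorize path above, the address vector comes from vectorizing the loads' pointer operands, and the gather itself is emitted with Builder.CreateMaskedGather(VecPtr, LI->getAlign()). A minimal IR sketch of the call shape this produces (the function @gather_shape and the %ptrs argument are illustrative):

define <2 x double> @gather_shape(<2 x double*> %ptrs) {
  ; %ptrs holds one address per lane; i32 8 is the per-element alignment
  ; taken from the scalar load; the all-true mask enables every lane; the
  ; final operand is the pass-through value for masked-off lanes (none here).
  %g = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
  ret <2 x double> %g
}

declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32 immarg, <2 x i1>, <2 x double>)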

llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll

Lines changed: 30 additions & 35 deletions
@@ -229,35 +229,34 @@ entry:
 define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
 ; CHECK-LABEL: @lookahead_external_uses(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
 ; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
 ; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
 ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
-; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
-; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double*> undef, double* [[A]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[A]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
-; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
 ; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
-; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double*> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A1]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B2]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP12]], [[TMP11]]
 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
 ; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -328,31 +327,27 @@ define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, do
 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
 ; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8
 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
 ; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
+; CHECK-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
+; CHECK-NEXT: [[SUBC0D0:%.*]] = fsub fast double [[C0]], [[D0]]
+; CHECK-NEXT: [[SUBA1B2:%.*]] = fsub fast double [[A1]], [[B2]]
+; CHECK-NEXT: [[SUBA2B1:%.*]] = fsub fast double [[A2]], [[B1]]
+; CHECK-NEXT: [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[SUBC0D0]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[SUBA1B2]], [[SUBA2B1]]
 ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8
-; CHECK-NEXT: store double [[TMP12]], double* [[EXT2:%.*]], align 8
-; CHECK-NEXT: store double [[TMP12]], double* [[EXT3:%.*]], align 8
+; CHECK-NEXT: store double [[ADD0]], double* [[IDXS0]], align 8
+; CHECK-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT2:%.*]], align 8
+; CHECK-NEXT: store double [[A1]], double* [[EXT3:%.*]], align 8
 ; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8
 ; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8
 ; CHECK-NEXT: ret void
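The lookahead_external_uses checks above also show how the pointer operand of the gather is built when the loads share a base pointer: the base is broadcast into a vector of pointers and a single getelementptr over that vector yields one address per lane (here offsets 0 and 2 from %A). A standalone sketch of that pattern (the function name @gather_via_vector_gep is illustrative):

define <2 x double> @gather_via_vector_gep(double* %A) {
  ; Broadcast the base pointer, then compute lane-wise offsets 0 and 2.
  %p0 = insertelement <2 x double*> undef, double* %A, i32 0
  %p1 = insertelement <2 x double*> %p0, double* %A, i32 1
  %ptrs = getelementptr double, <2 x double*> %p1, <2 x i64> <i64 0, i64 2>
  ; Load both lanes with one gather; equivalent to loading A[0] and A[2].
  %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
  ret <2 x double> %v
}

declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32 immarg, <2 x i1>, <2 x double>)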

llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll

Lines changed: 23 additions & 17 deletions
@@ -1,27 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX
 
 
 @b = global [8 x i32] zeroinitializer, align 16
 @a = global [8 x i32] zeroinitializer, align 16
 
 define void @foo() {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
-; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
-; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 1), align 4
-; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 2), align 8
-; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 3), align 4
-; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4), align 16
-; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 5), align 4
-; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 6), align 8
-; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 7), align 4
-; CHECK-NEXT: ret void
+; SSE-LABEL: @foo(
+; SSE-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
+; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16
+; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
+; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 1), align 4
+; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 2), align 8
+; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 3), align 4
+; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4), align 16
+; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 5), align 4
+; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 6), align 8
+; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 7), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @foo(
+; AVX-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> <i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2)>, i32 16, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16
+; AVX-NEXT: ret void
 ;
   %1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
   store i32 %1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16
