Skip to content

Commit 5deb4ef

Browse files
[SLP] Initial non-power-of-2 (but still whole register) support for remaining nodes
Added non-power-of-2 (but still whole-register) vectorization support for nodes other than stores and reductions. Reviewers: preames, RKSimon, hiraditya. Reviewed By: RKSimon. Pull Request: #113356.
1 parent a53abb2 commit 5deb4ef

File tree

7 files changed

+97
-94
lines changed

7 files changed

+97
-94
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2656,7 +2656,9 @@ class BoUpSLP {
26562656
}
26572657
// TODO: Check if we can remove a check for non-power-2 number of
26582658
// scalars after full support of non-power-2 vectorization.
2659-
return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2659+
return UniqueValues.size() != 2 &&
2660+
hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2661+
UniqueValues.size());
26602662
};
26612663

26622664
// If the initial strategy fails for any of the operand indexes, then we
@@ -5101,12 +5103,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
51015103
});
51025104
});
51035105
const unsigned AbsoluteDiff = std::abs(*Diff);
5104-
if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
5105-
((Sz > MinProfitableStridedLoads ||
5106-
(AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5107-
has_single_bit(AbsoluteDiff))) &&
5108-
AbsoluteDiff > Sz) ||
5109-
*Diff == -(static_cast<int>(Sz) - 1))) {
5106+
if (IsPossibleStrided &&
5107+
(IsAnyPointerUsedOutGraph ||
5108+
(AbsoluteDiff > Sz &&
5109+
(Sz > MinProfitableStridedLoads ||
5110+
(AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5111+
AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
5112+
*Diff == -(static_cast<int>(Sz) - 1))) {
51105113
int Stride = *Diff / static_cast<int>(Sz - 1);
51115114
if (*Diff == Stride * static_cast<int>(Sz - 1)) {
51125115
Align Alignment =
@@ -5192,17 +5195,20 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
51925195
return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
51935196

51945197
// FIXME: The following code has not been updated for non-power-of-2
5195-
// vectors. The splitting logic here does not cover the original
5196-
// vector if the vector factor is not a power of two. FIXME
5197-
if (!has_single_bit(VL.size()))
5198+
// vectors (and not whole registers). The splitting logic here does not
5199+
// cover the original vector if the vector factor is not a power of two.
5200+
if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
51985201
return false;
51995202

52005203
unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
52015204
unsigned MinVF = getMinVF(2 * Sz);
52025205
DemandedElts.clearAllBits();
52035206
// Iterate through possible vectorization factors and check if vectorized +
52045207
// shuffles is better than just gather.
5205-
for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
5208+
for (unsigned VF =
5209+
getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5210+
VF >= MinVF;
5211+
VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
52065212
SmallVector<LoadsState> States;
52075213
for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
52085214
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
@@ -7632,8 +7638,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
76327638
case Instruction::ExtractValue:
76337639
case Instruction::ExtractElement: {
76347640
bool Reuse = canReuseExtract(VL, CurrentOrder);
7635-
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7636-
if (!has_single_bit(VL.size()))
7641+
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7642+
// non-full registers).
7643+
if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
76377644
return TreeEntry::NeedToGather;
76387645
if (Reuse || !CurrentOrder.empty())
76397646
return TreeEntry::Vectorize;
@@ -8089,7 +8096,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
80898096
// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
80908097
if ((UserTreeIdx.UserTE &&
80918098
UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8092-
!has_single_bit(VL.size())) {
8099+
!hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
80938100
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
80948101
"for nodes with padding.\n");
80958102
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -9840,7 +9847,8 @@ void BoUpSLP::transformNodes() {
98409847
if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
98419848
(S.getOpcode() == Instruction::Load &&
98429849
areKnownNonVectorizableLoads(Slice)) ||
9843-
(S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
9850+
(S.getOpcode() != Instruction::Load &&
9851+
!hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
98449852
continue;
98459853
if (VF == 2) {
98469854
// Try to vectorize reduced values or if all users are vectorized.
@@ -13618,8 +13626,9 @@ BoUpSLP::isGatherShuffledEntry(
1361813626
return !TE->isGather();
1361913627
})))
1362013628
return {};
13621-
// FIXME: Gathering for non-power-of-2 nodes not implemented yet.
13622-
if (TE->isNonPowOf2Vec())
13629+
// FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
13630+
// implemented yet.
13631+
if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
1362313632
return {};
1362413633
Mask.assign(VL.size(), PoisonMaskElem);
1362513634
assert((TE->UserTreeIndices.size() == 1 ||
@@ -19200,9 +19209,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1920019209
}
1920119210
}
1920219211

19212+
Type *ScalarTy = getValueType(VL[0]);
1920319213
unsigned Sz = R.getVectorElementSize(I0);
1920419214
unsigned MinVF = R.getMinVF(Sz);
19205-
unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
19215+
unsigned MaxVF = std::max<unsigned>(
19216+
getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
1920619217
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
1920719218
if (MaxVF < 2) {
1920819219
R.getORE()->emit([&]() {
@@ -19216,10 +19227,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1921619227
bool Changed = false;
1921719228
bool CandidateFound = false;
1921819229
InstructionCost MinCost = SLPCostThreshold.getValue();
19219-
Type *ScalarTy = getValueType(VL[0]);
1922019230

1922119231
unsigned NextInst = 0, MaxInst = VL.size();
19222-
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
19232+
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19233+
VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
1922319234
// No actual vectorization should happen, if number of parts is the same as
1922419235
// provided vectorization factor (i.e. the scalar type is used for vector
1922519236
// code during codegen).
@@ -19234,7 +19245,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1923419245

1923519246
if (MaxVFOnly && ActualVF < MaxVF)
1923619247
break;
19237-
if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
19248+
if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
1923819249
break;
1923919250

1924019251
SmallVector<Value *> Ops(ActualVF, nullptr);

llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,14 @@ define i32 @test() {
77
; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]]
88
; CHECK: [[FUNC_135_EXIT_I]]:
99
; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
10-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
11-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
12-
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], <i32 0, i32 0, i32 0, i32 poison>
13-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2>
14-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
15-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 2, i32 2, i32 2, i32 poison>
16-
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
17-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP5]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 poison, i32 28, i32 29, i32 30, i32 poison>
10+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
11+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
12+
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]]
13+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3>
14+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1815
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[G_228_PROMOTED166_I1105_I]], i32 7
19-
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 0, i32 15
20-
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP3]], i64 0)
21-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 26, i32 7, i32 28, i32 29, i32 30, i32 31>
16+
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v12i32(<16 x i32> poison, <12 x i32> [[TMP3]], i64 0)
17+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 26, i32 7, i32 8, i32 9, i32 10, i32 11>
2218
; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer
2319
; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer
2420
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>

llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,7 @@ define <6 x double> @test(ptr %a) {
55
; CHECK-LABEL: define <6 x double> @test(
66
; CHECK-SAME: ptr [[A:%.*]]) {
77
; CHECK-NEXT: [[ENTRY:.*:]]
8-
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A]], align 8
9-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A]], i16 4
10-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
11-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
12-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
13-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <6 x double> [[TMP3]], <6 x double> [[TMP4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
8+
; CHECK-NEXT: [[TMP5:%.*]] = load <6 x double>, ptr [[A]], align 8
149
; CHECK-NEXT: ret <6 x double> [[TMP5]]
1510
;
1611
entry:

llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ define void @test(ptr noalias %0, ptr noalias %1) {
99
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
1010
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
1111
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
12-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
13-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
14-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
15-
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 5, i32 1>
12+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
13+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 poison, i32 poison>
14+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP7]], <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 5>
15+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP10]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11>
1616
; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
1717
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
1818
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8

llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,10 @@
44
define void @test() {
55
; CHECK-LABEL: define void @test() {
66
; CHECK-NEXT: [[XOR108_I_I_I:%.*]] = xor i64 0, 1
7-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> <i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 2
8-
; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i64> [[TMP1]], zeroinitializer
7+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 10
8+
; CHECK-NEXT: [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
99
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
10-
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> poison, <8 x i64> zeroinitializer, i64 0)
11-
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v4i64(<16 x i64> [[TMP4]], <4 x i64> [[TMP2]], i64 8)
10+
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v12i64(<16 x i64> poison, <12 x i64> [[TMP2]], i64 0)
1211
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
1312
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
1413
; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>

llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,23 +8,23 @@ define void @test(i32 %arg) {
88
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
99
; CHECK-NEXT: br label %[[BB1:.*]]
1010
; CHECK: [[BB1]]:
11-
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ]
12-
; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ]
11+
; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ]
12+
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ]
1313
; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX4:%.*]], %[[BB1]] ]
1414
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB1]] ]
15-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 0>
16-
; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer
15+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0>
1716
; CHECK-NEXT: [[ADD17:%.*]] = add i32 [[PHI]], 0
18-
; CHECK-NEXT: [[ADD18:%.*]] = add i32 [[PHI2]], 0
17+
; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[PHI]], 0
1918
; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[PHI2]], 0
20-
; CHECK-NEXT: [[ADD23:%.*]] = add i32 [[PHI2]], 0
19+
; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[PHI]], 0
20+
; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer
2121
; CHECK-NEXT: [[TMP4]] = add <2 x i32> [[TMP0]], <i32 0, i32 1>
2222
; CHECK-NEXT: [[TMP5]] = extractelement <2 x i32> [[TMP4]], i32 1
2323
; CHECK-NEXT: [[TMP6]] = extractelement <2 x i32> [[TMP4]], i32 0
2424
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP3]])
25-
; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD18]]
26-
; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i32 [[ADD17]], [[ADD19]]
27-
; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i32 [[ADD23]], [[TMP6]]
25+
; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD17]]
26+
; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i32 [[ADD4]], [[ADD6]]
27+
; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i32 [[ADD19]], [[TMP6]]
2828
; CHECK-NEXT: [[OP_RDX3:%.*]] = xor i32 [[OP_RDX]], [[OP_RDX1]]
2929
; CHECK-NEXT: [[OP_RDX4]] = xor i32 [[OP_RDX3]], [[OP_RDX2]]
3030
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[TMP5]], 0

0 commit comments

Comments
 (0)