Skip to content

Commit b0a0df9

Browse files
committed
[SLP]Fix vectorization of the alternate cmp instruction with swapped predicates.
If the predicate of the alternate cmp instruction is the swapped form of the main cmp instruction's predicate, the vectorizer needs to generate the alternate instruction itself, not an instruction with the swapped predicate. Also, a lane should be selected for the alternate opcode only if its operands are not compatible with the operands of the main instruction. Correctness confirmed: https://alive2.llvm.org/ce/z/94BG66 Differential Revision: https://reviews.llvm.org/D119855
1 parent f274230 commit b0a0df9

File tree

4 files changed

+143
-115
lines changed

4 files changed

+143
-115
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 30 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -4544,10 +4544,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
45444544
Value *RHS = Cmp->getOperand(1);
45454545
CmpInst::Predicate CurrentPred = Cmp->getPredicate();
45464546
if (P0 == AltP0Swapped) {
4547-
if ((P0 == CurrentPred &&
4548-
!areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
4549-
(AltP0 == CurrentPred &&
4550-
areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))
4547+
if (CI != Cmp && S.AltOp != Cmp &&
4548+
((P0 == CurrentPred &&
4549+
!areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) ||
4550+
(AltP0 == CurrentPred &&
4551+
areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS))))
45514552
std::swap(LHS, RHS);
45524553
} else if (P0 != CurrentPred && AltP0 != CurrentPred) {
45534554
std::swap(LHS, RHS);
@@ -4835,6 +4836,29 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
48354836
}
48364837
}
48374838

4839+
/// Checks if the specified instruction \p I is an alternate operation for the
4840+
/// given \p MainOp and \p AltOp instructions.
4841+
static bool isAlternateInstruction(const Instruction *I,
4842+
const Instruction *MainOp,
4843+
const Instruction *AltOp) {
4844+
if (auto *CI0 = dyn_cast<CmpInst>(MainOp)) {
4845+
auto *AltCI0 = cast<CmpInst>(AltOp);
4846+
auto *CI = cast<CmpInst>(I);
4847+
CmpInst::Predicate P0 = CI0->getPredicate();
4848+
CmpInst::Predicate AltP0 = AltCI0->getPredicate();
4849+
assert(P0 != AltP0 && "Expected different main/alternate predicates.");
4850+
CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0);
4851+
CmpInst::Predicate CurrentPred = CI->getPredicate();
4852+
if (P0 == AltP0Swapped)
4853+
return I == AltCI0 ||
4854+
(I != MainOp &&
4855+
!areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1),
4856+
CI->getOperand(0), CI->getOperand(1)));
4857+
return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
4858+
}
4859+
return I->getOpcode() == AltOp->getOpcode();
4860+
}
4861+
48384862
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
48394863
ArrayRef<Value *> VectorizedVals) {
48404864
ArrayRef<Value*> VL = E->Scalars;
@@ -5560,28 +5584,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
55605584
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
55615585
[E](Instruction *I) {
55625586
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
5563-
if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
5564-
auto *AltCI0 = cast<CmpInst>(E->getAltOp());
5565-
auto *CI = cast<CmpInst>(I);
5566-
CmpInst::Predicate P0 = CI0->getPredicate();
5567-
CmpInst::Predicate AltP0 = AltCI0->getPredicate();
5568-
assert(P0 != AltP0 &&
5569-
"Expected different main/alternate predicates.");
5570-
CmpInst::Predicate AltP0Swapped =
5571-
CmpInst::getSwappedPredicate(AltP0);
5572-
CmpInst::Predicate CurrentPred = CI->getPredicate();
5573-
if (P0 == AltP0Swapped)
5574-
return (P0 == CurrentPred &&
5575-
!areCompatibleCmpOps(
5576-
CI0->getOperand(0), CI0->getOperand(1),
5577-
CI->getOperand(0), CI->getOperand(1))) ||
5578-
(AltP0 == CurrentPred &&
5579-
!areCompatibleCmpOps(
5580-
CI0->getOperand(0), CI0->getOperand(1),
5581-
CI->getOperand(1), CI->getOperand(0)));
5582-
return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
5583-
}
5584-
return I->getOpcode() == E->getAltOpcode();
5587+
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());
55855588
},
55865589
Mask);
55875590
CommonCost =
@@ -7081,10 +7084,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
70817084
V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
70827085
auto *AltCI = cast<CmpInst>(E->getAltOp());
70837086
CmpInst::Predicate AltPred = AltCI->getPredicate();
7084-
unsigned AltIdx =
7085-
std::distance(E->Scalars.begin(), find(E->Scalars, AltCI));
7086-
if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx])
7087-
AltPred = CmpInst::getSwappedPredicate(AltPred);
70887087
V1 = Builder.CreateCmp(AltPred, LHS, RHS);
70897088
} else {
70907089
V0 = Builder.CreateCast(
@@ -7110,28 +7109,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
71107109
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
71117110
[E](Instruction *I) {
71127111
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
7113-
if (auto *CI0 = dyn_cast<CmpInst>(E->getMainOp())) {
7114-
auto *AltCI0 = cast<CmpInst>(E->getAltOp());
7115-
auto *CI = cast<CmpInst>(I);
7116-
CmpInst::Predicate P0 = CI0->getPredicate();
7117-
CmpInst::Predicate AltP0 = AltCI0->getPredicate();
7118-
assert(P0 != AltP0 &&
7119-
"Expected different main/alternate predicates.");
7120-
CmpInst::Predicate AltP0Swapped =
7121-
CmpInst::getSwappedPredicate(AltP0);
7122-
CmpInst::Predicate CurrentPred = CI->getPredicate();
7123-
if (P0 == AltP0Swapped)
7124-
return (P0 == CurrentPred &&
7125-
!areCompatibleCmpOps(
7126-
CI0->getOperand(0), CI0->getOperand(1),
7127-
CI->getOperand(0), CI->getOperand(1))) ||
7128-
(AltP0 == CurrentPred &&
7129-
!areCompatibleCmpOps(
7130-
CI0->getOperand(0), CI0->getOperand(1),
7131-
CI->getOperand(1), CI->getOperand(0)));
7132-
return AltP0 == CurrentPred || AltP0Swapped == CurrentPred;
7133-
}
7134-
return I->getOpcode() == E->getAltOpcode();
7112+
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());
71357113
},
71367114
Mask, &OpScalars, &AltScalars);
71377115

llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,16 @@ define i16 @test(i16 %call37) {
55
; CHECK-LABEL: @test(
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[CALL:%.*]] = load i16, i16* undef, align 2
8-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> <i16 poison, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 [[CALL]], i32 0
9-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 poison, i16 0, i16 0, i16 poison, i16 poison>, i16 [[CALL37:%.*]], i32 3
10-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 4, i32 3, i32 5>
11-
; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP0]], [[SHUFFLE]]
12-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
13-
; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16>
14-
; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]])
15-
; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i16 [[TMP5]], 0
8+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> <i16 poison, i16 0, i16 0, i16 0, i16 poison, i16 0, i16 0, i16 0>, i16 [[CALL37:%.*]], i32 4
9+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 0
10+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 poison, i16 0, i16 0, i16 poison, i16 0>, i16 [[CALL37]], i32 3
11+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[CALL37]], i32 6
12+
; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP1]], [[TMP3]]
13+
; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i16> [[TMP1]], [[TMP3]]
14+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 5, i32 14, i32 7>
15+
; CHECK-NEXT: [[TMP7:%.*]] = zext <8 x i1> [[TMP6]] to <8 x i16>
16+
; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP7]])
17+
; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i16 [[TMP8]], 0
1618
; CHECK-NEXT: ret i16 [[OP_EXTRA]]
1719
;
1820
entry:

llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,20 +46,21 @@ define { <2 x float>, <2 x float> } @test1(i32 %conv.i32.i.i.i) {
4646
; CHECK-LABEL: @test1(
4747
; CHECK-NEXT: entry:
4848
; CHECK-NEXT: [[CONV_I32_I_I_I1:%.*]] = fptosi float 0.000000e+00 to i32
49-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[CONV_I32_I_I_I:%.*]], i32 0
50-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[CONV_I32_I_I_I1]], i32 2
51-
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP0]], [[TMP1]]
52-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
53-
; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> zeroinitializer, <4 x float> zeroinitializer
54-
; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer
55-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
56-
; CHECK-NEXT: [[RETVAL_SROA_0_0_VEC_INSERT4:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP6]], i64 0
57-
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
58-
; CHECK-NEXT: [[RETVAL_SROA_0_4_VEC_INSERT7:%.*]] = insertelement <2 x float> [[RETVAL_SROA_0_0_VEC_INSERT4]], float [[TMP7]], i64 1
59-
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
60-
; CHECK-NEXT: [[RETVAL_SROA_7_8_VEC_INSERT11:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP8]], i64 0
61-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
62-
; CHECK-NEXT: [[RETVAL_SROA_7_12_VEC_INSERT13:%.*]] = insertelement <2 x float> [[RETVAL_SROA_7_8_VEC_INSERT11]], float [[TMP9]], i64 1
49+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, i32 [[CONV_I32_I_I_I:%.*]], i32 0
50+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[CONV_I32_I_I_I1]], i32 2
51+
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
52+
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
53+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
54+
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x float> zeroinitializer, <4 x float> zeroinitializer
55+
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], zeroinitializer
56+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP6]], i32 0
57+
; CHECK-NEXT: [[RETVAL_SROA_0_0_VEC_INSERT4:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP7]], i64 0
58+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
59+
; CHECK-NEXT: [[RETVAL_SROA_0_4_VEC_INSERT7:%.*]] = insertelement <2 x float> [[RETVAL_SROA_0_0_VEC_INSERT4]], float [[TMP8]], i64 1
60+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
61+
; CHECK-NEXT: [[RETVAL_SROA_7_8_VEC_INSERT11:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP9]], i64 0
62+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
63+
; CHECK-NEXT: [[RETVAL_SROA_7_12_VEC_INSERT13:%.*]] = insertelement <2 x float> [[RETVAL_SROA_7_8_VEC_INSERT11]], float [[TMP10]], i64 1
6364
; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } zeroinitializer, <2 x float> [[RETVAL_SROA_0_4_VEC_INSERT7]], 0
6465
; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[RETVAL_SROA_7_12_VEC_INSERT13]], 1
6566
; CHECK-NEXT: ret { <2 x float>, <2 x float> } zeroinitializer

llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -251,28 +251,53 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
251251
}
252252

253253
define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
254-
; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
255-
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
256-
; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
257-
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
258-
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
259-
; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
260-
; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
261-
; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
262-
; CHECK-NEXT: call void @use1(i1 [[C2]])
263-
; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42
264-
; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
265-
; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
266-
; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
267-
; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
268-
; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
269-
; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
270-
; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
271-
; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
272-
; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
273-
; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
274-
; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
275-
; CHECK-NEXT: ret i1 [[S7]]
254+
; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
255+
; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
256+
; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
257+
; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
258+
; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
259+
; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
260+
; SSE-NEXT: call void @use1(i1 [[C2]])
261+
; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
262+
; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
263+
; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X3]], i32 2
264+
; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3
265+
; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
266+
; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
267+
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
268+
; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
269+
; SSE-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
270+
; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
271+
; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
272+
; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
273+
; SSE-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[C2]], i1 false
274+
; SSE-NEXT: [[S5:%.*]] = select i1 [[OP_EXTRA]], i1 [[D1]], i1 false
275+
; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
276+
; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
277+
; SSE-NEXT: ret i1 [[S7]]
278+
;
279+
; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
280+
; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
281+
; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
282+
; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
283+
; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
284+
; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
285+
; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
286+
; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
287+
; AVX-NEXT: call void @use1(i1 [[C2]])
288+
; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42
289+
; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
290+
; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
291+
; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
292+
; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
293+
; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
294+
; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
295+
; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
296+
; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
297+
; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
298+
; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
299+
; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
300+
; AVX-NEXT: ret i1 [[S7]]
276301
;
277302
%x0 = extractelement <4 x i32> %x, i32 0
278303
%x1 = extractelement <4 x i32> %x, i32 1
@@ -395,25 +420,47 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
395420
}
396421

397422
define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
398-
; CHECK-LABEL: @logical_and_icmp_clamp_partial(
399-
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
400-
; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
401-
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
402-
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
403-
; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
404-
; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
405-
; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
406-
; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
407-
; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
408-
; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
409-
; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
410-
; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
411-
; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
412-
; CHECK-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false
413-
; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
414-
; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
415-
; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
416-
; CHECK-NEXT: ret i1 [[S7]]
423+
; SSE-LABEL: @logical_and_icmp_clamp_partial(
424+
; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
425+
; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
426+
; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
427+
; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
428+
; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
429+
; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
430+
; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
431+
; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3
432+
; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
433+
; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
434+
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
435+
; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
436+
; SSE-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
437+
; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
438+
; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
439+
; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
440+
; SSE-NEXT: [[S5:%.*]] = select i1 [[TMP9]], i1 [[D1]], i1 false
441+
; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
442+
; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
443+
; SSE-NEXT: ret i1 [[S7]]
444+
;
445+
; AVX-LABEL: @logical_and_icmp_clamp_partial(
446+
; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
447+
; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
448+
; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
449+
; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
450+
; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
451+
; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
452+
; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
453+
; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
454+
; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
455+
; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
456+
; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
457+
; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
458+
; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
459+
; AVX-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false
460+
; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
461+
; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
462+
; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
463+
; AVX-NEXT: ret i1 [[S7]]
417464
;
418465
%x0 = extractelement <4 x i32> %x, i32 0
419466
%x1 = extractelement <4 x i32> %x, i32 1

0 commit comments

Comments
 (0)