Skip to content

Commit 8217c2e

Browse files
authored
[VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well (#120075)
Extend the binary-instruction matching to also handle compare instructions (icmp/fcmp), provided both compares use the same predicate.
1 parent 9919295 commit 8217c2e

File tree

2 files changed

+169
-64
lines changed

2 files changed

+169
-64
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1628,39 +1628,44 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
16281628
}
16291629

16301630
/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
1631-
/// TODO: Handle "shuffle (cmp), (cmp)" into "cmp (shuffle), (shuffle)".
1631+
/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
16321632
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
16331633
ArrayRef<int> OldMask;
16341634
Instruction *LHS, *RHS;
16351635
if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
16361636
m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
16371637
return false;
16381638

1639-
BinaryOperator *B0, *B1;
1640-
if (!match(LHS, m_BinOp(B0)) || !match(RHS, m_BinOp(B1)))
1641-
return false;
1642-
1643-
// Don't introduce poison into div/rem.
1644-
if (llvm::is_contained(OldMask, PoisonMaskElem) && B0->isIntDivRem())
1639+
// TODO: Add support for addlike etc.
1640+
if (LHS->getOpcode() != RHS->getOpcode())
16451641
return false;
16461642

1647-
// TODO: Add support for addlike etc.
1648-
Instruction::BinaryOps Opcode = B0->getOpcode();
1649-
if (Opcode != B1->getOpcode())
1643+
Value *X, *Y, *Z, *W;
1644+
bool IsCommutative = false;
1645+
CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1646+
if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
1647+
match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
1648+
auto *BO = cast<BinaryOperator>(LHS);
1649+
// Don't introduce poison into div/rem.
1650+
if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
1651+
return false;
1652+
IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
1653+
} else if (match(LHS, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
1654+
match(RHS, m_SpecificCmp(Pred, m_Value(Z), m_Value(W)))) {
1655+
IsCommutative = cast<CmpInst>(LHS)->isCommutative();
1656+
} else
16501657
return false;
16511658

16521659
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
1653-
auto *BinOpTy = dyn_cast<FixedVectorType>(LHS->getType());
1654-
if (!ShuffleDstTy || !BinOpTy)
1660+
auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
1661+
auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
1662+
if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
16551663
return false;
16561664

16571665
unsigned NumSrcElts = BinOpTy->getNumElements();
16581666

16591667
// If we have something like "add X, Y" and "add Z, X", swap ops to match.
1660-
Value *X = LHS->getOperand(0), *Y = LHS->getOperand(1);
1661-
Value *Z = RHS->getOperand(0), *W = RHS->getOperand(1);
1662-
if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
1663-
(X == W || Y == Z))
1668+
if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
16641669
std::swap(X, Y);
16651670

16661671
auto ConvertToUnary = [NumSrcElts](int &M) {
@@ -1688,13 +1693,22 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
16881693
InstructionCost OldCost =
16891694
TTI.getInstructionCost(LHS, CostKind) +
16901695
TTI.getInstructionCost(RHS, CostKind) +
1691-
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
1696+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
16921697
OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
16931698

16941699
InstructionCost NewCost =
16951700
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
1696-
TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) +
1697-
TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
1701+
TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
1702+
1703+
if (Pred == CmpInst::BAD_ICMP_PREDICATE) {
1704+
NewCost +=
1705+
TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
1706+
} else {
1707+
auto *ShuffleCmpTy =
1708+
FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
1709+
NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
1710+
ShuffleDstTy, Pred, CostKind);
1711+
}
16981712

16991713
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
17001714
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -1704,7 +1718,10 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
17041718

17051719
Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
17061720
Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
1707-
Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
1721+
Value *NewBO = Pred == CmpInst::BAD_ICMP_PREDICATE
1722+
? Builder.CreateBinOp(
1723+
cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
1724+
: Builder.CreateCmp(Pred, Shuf0, Shuf1);
17081725

17091726
// Intersect flags from the old binops.
17101727
if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {

llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll

Lines changed: 132 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,37 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK
3-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK
4-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK
5-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK
2+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
4+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
5+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
66

77
declare void @use(<4 x i1>)
88

99
; icmp - eq v4i32 is cheap
1010

1111
define <4 x i32> @shuf_icmp_eq_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
12-
; CHECK-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
13-
; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
14-
; CHECK-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
15-
; CHECK-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
16-
; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
17-
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
18-
; CHECK-NEXT: ret <4 x i32> [[R]]
12+
; SSE-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
13+
; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
14+
; SSE-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
15+
; SSE-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
16+
; SSE-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
17+
; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
18+
; SSE-NEXT: ret <4 x i32> [[R]]
19+
;
20+
; AVX2-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
21+
; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
22+
; AVX2-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
23+
; AVX2-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
24+
; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
25+
; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
26+
; AVX2-NEXT: ret <4 x i32> [[R]]
27+
;
28+
; AVX512-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
29+
; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
30+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
31+
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
32+
; AVX512-NEXT: [[S:%.*]] = icmp eq <4 x i32> [[TMP1]], [[TMP2]]
33+
; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
34+
; AVX512-NEXT: ret <4 x i32> [[R]]
1935
;
2036
%c0 = icmp eq <4 x i32> %x, %y
2137
%c1 = icmp eq <4 x i32> %z, %w
@@ -27,13 +43,37 @@ define <4 x i32> @shuf_icmp_eq_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <
2743
; icmp - eq v2i64 is only cheap on SSE4+ targets with PCMPEQQ
2844

2945
define <2 x i64> @shuf_icmp_eq_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z, <2 x i64> %w) {
30-
; CHECK-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
31-
; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
32-
; CHECK-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
33-
; CHECK-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
34-
; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
35-
; CHECK-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
36-
; CHECK-NEXT: ret <2 x i64> [[R]]
46+
; SSE2-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
47+
; SSE2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
48+
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
49+
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
50+
; SSE2-NEXT: [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
51+
; SSE2-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
52+
; SSE2-NEXT: ret <2 x i64> [[R]]
53+
;
54+
; SSE4-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
55+
; SSE4-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
56+
; SSE4-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
57+
; SSE4-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
58+
; SSE4-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
59+
; SSE4-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
60+
; SSE4-NEXT: ret <2 x i64> [[R]]
61+
;
62+
; AVX2-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
63+
; AVX2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
64+
; AVX2-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
65+
; AVX2-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
66+
; AVX2-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
67+
; AVX2-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
68+
; AVX2-NEXT: ret <2 x i64> [[R]]
69+
;
70+
; AVX512-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
71+
; AVX512-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
72+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
73+
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
74+
; AVX512-NEXT: [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
75+
; AVX512-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
76+
; AVX512-NEXT: ret <2 x i64> [[R]]
3777
;
3878
%c0 = icmp eq <2 x i64> %x, %y
3979
%c1 = icmp eq <2 x i64> %z, %w
@@ -46,10 +86,10 @@ define <2 x i64> @shuf_icmp_eq_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z, <
4686

4787
define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
4888
; CHECK-LABEL: define <4 x i32> @shuf_icmp_ugt_v4i32(
49-
; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0]] {
50-
; CHECK-NEXT: [[C0:%.*]] = icmp ugt <4 x i32> [[X]], [[Y]]
51-
; CHECK-NEXT: [[C1:%.*]] = icmp ugt <4 x i32> [[Z]], [[W]]
52-
; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
89+
; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
90+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
91+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
92+
; CHECK-NEXT: [[S:%.*]] = icmp ugt <4 x i32> [[TMP1]], [[TMP2]]
5393
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
5494
; CHECK-NEXT: ret <4 x i32> [[R]]
5595
;
@@ -60,16 +100,32 @@ define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z,
60100
ret <4 x i32> %r
61101
}
62102

63-
; Common operand is op0 of the fcmps.
103+
; Common operand is op0 of the fcmps (CMPPS cheaper on SSE4+).
64104

65105
define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
66-
; CHECK-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
67-
; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
68-
; CHECK-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
69-
; CHECK-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
70-
; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
71-
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
72-
; CHECK-NEXT: ret <4 x i32> [[R]]
106+
; SSE2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
107+
; SSE2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
108+
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
109+
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
110+
; SSE2-NEXT: [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
111+
; SSE2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
112+
; SSE2-NEXT: ret <4 x i32> [[R]]
113+
;
114+
; SSE4-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
115+
; SSE4-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
116+
; SSE4-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
117+
; SSE4-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
118+
; SSE4-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
119+
; SSE4-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
120+
; SSE4-NEXT: ret <4 x i32> [[R]]
121+
;
122+
; AVX-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
123+
; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
124+
; AVX-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
125+
; AVX-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
126+
; AVX-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
127+
; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
128+
; AVX-NEXT: ret <4 x i32> [[R]]
73129
;
74130
%b0 = fcmp oeq <4 x float> %x, %y
75131
%b1 = fcmp oeq <4 x float> %x, %z
@@ -81,13 +137,29 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
81137
; For commutative instructions, common operand may be swapped
82138

83139
define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
84-
; CHECK-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
85-
; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
86-
; CHECK-NEXT: [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
87-
; CHECK-NEXT: [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
88-
; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
89-
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
90-
; CHECK-NEXT: ret <4 x i32> [[R]]
140+
; SSE-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
141+
; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
142+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
143+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
144+
; SSE-NEXT: [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
145+
; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
146+
; SSE-NEXT: ret <4 x i32> [[R]]
147+
;
148+
; AVX2-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
149+
; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
150+
; AVX2-NEXT: [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
151+
; AVX2-NEXT: [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
152+
; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
153+
; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
154+
; AVX2-NEXT: ret <4 x i32> [[R]]
155+
;
156+
; AVX512-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
157+
; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
158+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
159+
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
160+
; AVX512-NEXT: [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
161+
; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
162+
; AVX512-NEXT: ret <4 x i32> [[R]]
91163
;
92164
%b0 = fcmp one <4 x float> %x, %y
93165
%b1 = fcmp one <4 x float> %z, %x
@@ -99,13 +171,29 @@ define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x
99171
; non-commutative pred, but common op0
100172

101173
define <4 x i32> @shuf_icmp_sgt_v4i32_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
102-
; CHECK-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
103-
; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
104-
; CHECK-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
105-
; CHECK-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
106-
; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
107-
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
108-
; CHECK-NEXT: ret <4 x i32> [[R]]
174+
; SSE-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
175+
; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
176+
; SSE-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
177+
; SSE-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
178+
; SSE-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
179+
; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
180+
; SSE-NEXT: ret <4 x i32> [[R]]
181+
;
182+
; AVX2-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
183+
; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
184+
; AVX2-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
185+
; AVX2-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
186+
; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
187+
; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
188+
; AVX2-NEXT: ret <4 x i32> [[R]]
189+
;
190+
; AVX512-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
191+
; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
192+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 2>
193+
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
194+
; AVX512-NEXT: [[S:%.*]] = icmp sgt <4 x i32> [[TMP1]], [[TMP2]]
195+
; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
196+
; AVX512-NEXT: ret <4 x i32> [[R]]
109197
;
110198
%b0 = icmp sgt <4 x i32> %x, %y
111199
%b1 = icmp sgt <4 x i32> %x, %z

0 commit comments

Comments
 (0)