-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well #120075
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
…as well Extend binary instructions matching to match compare instructions + predicate as well.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: Simon Pilgrim (RKSimon) ChangesExtend binary instructions matching to match compare instructions + predicate as well. Full diff: https://github.com/llvm/llvm-project/pull/120075.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e0304944df3c0c..db77d6c955792c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1628,7 +1628,7 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
-/// TODO: Handle "shuffle (cmp), (cmp)" into "cmp (shuffle), (shuffle)".
+/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
ArrayRef<int> OldMask;
Instruction *LHS, *RHS;
@@ -1636,31 +1636,36 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
return false;
- BinaryOperator *B0, *B1;
- if (!match(LHS, m_BinOp(B0)) || !match(RHS, m_BinOp(B1)))
- return false;
-
- // Don't introduce poison into div/rem.
- if (llvm::is_contained(OldMask, PoisonMaskElem) && B0->isIntDivRem())
+ // TODO: Add support for addlike etc.
+ if (LHS->getOpcode() != RHS->getOpcode())
return false;
- // TODO: Add support for addlike etc.
- Instruction::BinaryOps Opcode = B0->getOpcode();
- if (Opcode != B1->getOpcode())
+ Value *X, *Y, *Z, *W;
+ bool IsCommutative = false;
+ CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
+ match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
+ auto *BO = cast<BinaryOperator>(LHS);
+ // Don't introduce poison into div/rem.
+ if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
+ return false;
+ IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
+ } else if (match(LHS, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
+ match(RHS, m_SpecificCmp(Pred, m_Value(Z), m_Value(W)))) {
+ IsCommutative = cast<CmpInst>(LHS)->isCommutative();
+ } else
return false;
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
- auto *BinOpTy = dyn_cast<FixedVectorType>(LHS->getType());
- if (!ShuffleDstTy || !BinOpTy)
+ auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
+ auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
+ if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
return false;
unsigned NumSrcElts = BinOpTy->getNumElements();
// If we have something like "add X, Y" and "add Z, X", swap ops to match.
- Value *X = LHS->getOperand(0), *Y = LHS->getOperand(1);
- Value *Z = RHS->getOperand(0), *W = RHS->getOperand(1);
- if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
- (X == W || Y == Z))
+ if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
std::swap(X, Y);
auto ConvertToUnary = [NumSrcElts](int &M) {
@@ -1688,13 +1693,22 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
InstructionCost OldCost =
TTI.getInstructionCost(LHS, CostKind) +
TTI.getInstructionCost(RHS, CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
InstructionCost NewCost =
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
- TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) +
- TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
+ TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
+
+ if (Pred == CmpInst::BAD_ICMP_PREDICATE) {
+ NewCost +=
+ TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
+ } else {
+ auto *ShuffleCmpTy =
+ FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
+ NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
+ ShuffleDstTy, Pred, CostKind);
+ }
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -1704,7 +1718,10 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
- Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+ Value *NewBO = Pred == CmpInst::BAD_ICMP_PREDICATE
+ ? Builder.CreateBinOp(
+ cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
+ : Builder.CreateCmp(Pred, Shuf0, Shuf1);
// Intersect flags from the old binops.
if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index 008c1e7e694b96..b3360b61e66e81 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -1,21 +1,37 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
declare void @use(<4 x i1>)
; icmp - eq v4i32 is cheap
define <4 x i32> @shuf_icmp_eq_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
-; CHECK-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
+; SSE-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
+; SSE-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX2-NEXT: [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
+; AVX2-NEXT: [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
+; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX512-NEXT: [[S:%.*]] = icmp eq <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT: ret <4 x i32> [[R]]
;
%c0 = icmp eq <4 x i32> %x, %y
%c1 = icmp eq <4 x i32> %z, %w
@@ -27,13 +43,37 @@ define <4 x i32> @shuf_icmp_eq_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <
; icmp - eq v2i64 is only cheap on SSE4+ targets with PCMPEQQ
define <2 x i64> @shuf_icmp_eq_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z, <2 x i64> %w) {
-; CHECK-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
-; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
-; CHECK-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
-; CHECK-NEXT: ret <2 x i64> [[R]]
+; SSE2-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; SSE2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT: [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; SSE4-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; SSE4-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; SSE4-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
+; SSE4-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
+; SSE4-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
+; SSE4-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; SSE4-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; AVX2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
+; AVX2-NEXT: [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
+; AVX2-NEXT: [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
+; AVX2-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX512-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; AVX512-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT: [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; AVX512-NEXT: ret <2 x i64> [[R]]
;
%c0 = icmp eq <2 x i64> %x, %y
%c1 = icmp eq <2 x i64> %z, %w
@@ -46,10 +86,10 @@ define <2 x i64> @shuf_icmp_eq_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z, <
define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
; CHECK-LABEL: define <4 x i32> @shuf_icmp_ugt_v4i32(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[C0:%.*]] = icmp ugt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT: [[C1:%.*]] = icmp ugt <4 x i32> [[Z]], [[W]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[S:%.*]] = icmp ugt <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[R]]
;
@@ -60,16 +100,32 @@ define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z,
ret <4 x i32> %r
}
-; Common operand is op0 of the fcmps.
+; Common operand is op0 of the fcmps (CMPPS cheaper on SSE4+).
define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
-; CHECK-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; SSE2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE2-NEXT: [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; SSE4-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; SSE4-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE4-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; SSE4-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; SSE4-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE4-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE4-NEXT: ret <4 x i32> [[R]]
+;
+; AVX-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; AVX-NEXT: [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; AVX-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX-NEXT: ret <4 x i32> [[R]]
;
%b0 = fcmp oeq <4 x float> %x, %y
%b1 = fcmp oeq <4 x float> %x, %z
@@ -81,13 +137,29 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
; For commutative instructions, common operand may be swapped
define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
-; CHECK-NEXT: [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; SSE-NEXT: [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
+; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; AVX512-NEXT: [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT: ret <4 x i32> [[R]]
;
%b0 = fcmp one <4 x float> %x, %y
%b1 = fcmp one <4 x float> %z, %x
@@ -99,13 +171,29 @@ define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x
; non-commutative pred, but common op0
define <4 x i32> @shuf_icmp_sgt_v4i32_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
-; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
+; SSE-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
+; SSE-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
+; AVX2-NEXT: [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
+; AVX2-NEXT: [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; AVX2-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 2>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; AVX512-NEXT: [[S:%.*]] = icmp sgt <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT: ret <4 x i32> [[R]]
;
%b0 = icmp sgt <4 x i32> %x, %y
%b1 = icmp sgt <4 x i32> %x, %z
|
; SSE2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] { | ||
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3> | ||
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3> | ||
; SSE2-NEXT: [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]] | ||
; SSE2-NEXT: [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64> | ||
; SSE2-NEXT: ret <2 x i64> [[R]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this really cheaper?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pre-SSE4 - yes: https://zig.godbolt.org/z/nxEnGMrcn
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/10417 Here is the relevant piece of the build log for the reference
|
Extend binary instructions matching to match compare instructions + predicate as well.