[VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well #120075

RKSimon · 2024-12-16T13:05:39Z

Extend the binary-instruction matching in foldShuffleOfBinops to also match compare instructions (icmp/fcmp), requiring both compares to use the same predicate.

Extend the binary-instruction matching to also match compare instructions together with their predicate.

llvmbot · 2024-12-16T13:06:13Z

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Simon Pilgrim (RKSimon)

Changes

Extend the binary-instruction matching in foldShuffleOfBinops to also match compare instructions (icmp/fcmp), requiring both compares to use the same predicate.

Full diff: https://github.com/llvm/llvm-project/pull/120075.diff

2 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+37-20)
(modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll (+132-44)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e0304944df3c0c..db77d6c955792c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1628,7 +1628,7 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
 }
 
 /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
-/// TODO: Handle "shuffle (cmp), (cmp)" into "cmp (shuffle), (shuffle)".
+/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
 bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   ArrayRef<int> OldMask;
   Instruction *LHS, *RHS;
@@ -1636,31 +1636,36 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
                            m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
     return false;
 
-  BinaryOperator *B0, *B1;
-  if (!match(LHS, m_BinOp(B0)) || !match(RHS, m_BinOp(B1)))
-    return false;
-
-  // Don't introduce poison into div/rem.
-  if (llvm::is_contained(OldMask, PoisonMaskElem) && B0->isIntDivRem())
+  // TODO: Add support for addlike etc.
+  if (LHS->getOpcode() != RHS->getOpcode())
     return false;
 
-  // TODO: Add support for addlike etc.
-  Instruction::BinaryOps Opcode = B0->getOpcode();
-  if (Opcode != B1->getOpcode())
+  Value *X, *Y, *Z, *W;
+  bool IsCommutative = false;
+  CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+  if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
+      match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
+    auto *BO = cast<BinaryOperator>(LHS);
+    // Don't introduce poison into div/rem.
+    if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
+      return false;
+    IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
+  } else if (match(LHS, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
+             match(RHS, m_SpecificCmp(Pred, m_Value(Z), m_Value(W)))) {
+    IsCommutative = cast<CmpInst>(LHS)->isCommutative();
+  } else
     return false;
 
   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
-  auto *BinOpTy = dyn_cast<FixedVectorType>(LHS->getType());
-  if (!ShuffleDstTy || !BinOpTy)
+  auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
+  auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
+  if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
     return false;
 
   unsigned NumSrcElts = BinOpTy->getNumElements();
 
   // If we have something like "add X, Y" and "add Z, X", swap ops to match.
-  Value *X = LHS->getOperand(0), *Y = LHS->getOperand(1);
-  Value *Z = RHS->getOperand(0), *W = RHS->getOperand(1);
-  if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
-      (X == W || Y == Z))
+  if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
     std::swap(X, Y);
 
   auto ConvertToUnary = [NumSrcElts](int &M) {
@@ -1688,13 +1693,22 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   InstructionCost OldCost =
       TTI.getInstructionCost(LHS, CostKind) +
       TTI.getInstructionCost(RHS, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
                          OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
 
   InstructionCost NewCost =
       TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
-      TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}) +
-      TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
+      TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
+
+  if (Pred == CmpInst::BAD_ICMP_PREDICATE) {
+    NewCost +=
+        TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
+  } else {
+    auto *ShuffleCmpTy =
+        FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
+    NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
+                                      ShuffleDstTy, Pred, CostKind);
+  }
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -1704,7 +1718,10 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
 
   Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
   Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
-  Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+  Value *NewBO = Pred == CmpInst::BAD_ICMP_PREDICATE
+                     ? Builder.CreateBinOp(
+                           cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
+                     : Builder.CreateCmp(Pred, Shuf0, Shuf1);
 
   // Intersect flags from the old binops.
   if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index 008c1e7e694b96..b3360b61e66e81 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -1,21 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 declare void @use(<4 x i1>)
 
 ; icmp - eq v4i32 is cheap
 
 define <4 x i32> @shuf_icmp_eq_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
-; CHECK-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT:    [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT:    [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
+; SSE-NEXT:    [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
+; SSE-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX2-NEXT:    [[C0:%.*]] = icmp eq <4 x i32> [[X]], [[Y]]
+; AVX2-NEXT:    [[C1:%.*]] = icmp eq <4 x i32> [[Z]], [[W]]
+; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_icmp_eq_v4i32(
+; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX512-NEXT:    [[S:%.*]] = icmp eq <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT:    ret <4 x i32> [[R]]
 ;
   %c0 = icmp eq <4 x i32> %x, %y
   %c1 = icmp eq <4 x i32> %z, %w
@@ -27,13 +43,37 @@ define <4 x i32> @shuf_icmp_eq_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <
 ; icmp - eq v2i64 is only cheap on SSE4+ targets with PCMPEQQ
 
 define <2 x i64> @shuf_icmp_eq_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z, <2 x i64> %w) {
-; CHECK-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
-; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
-; CHECK-NEXT:    [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
-; CHECK-NEXT:    ret <2 x i64> [[R]]
+; SSE2-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; SSE2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; SSE4-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; SSE4-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; SSE4-NEXT:    [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
+; SSE4-NEXT:    [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
+; SSE4-NEXT:    [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
+; SSE4-NEXT:    [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; SSE4-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; AVX2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT:    [[C0:%.*]] = icmp eq <2 x i64> [[X]], [[Y]]
+; AVX2-NEXT:    [[C1:%.*]] = icmp eq <2 x i64> [[Z]], [[W]]
+; AVX2-NEXT:    [[S:%.*]] = shufflevector <2 x i1> [[C0]], <2 x i1> [[C1]], <2 x i32> <i32 1, i32 3>
+; AVX2-NEXT:    [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX512-LABEL: define <2 x i64> @shuf_icmp_eq_v2i64(
+; AVX512-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT:    [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; AVX512-NEXT:    ret <2 x i64> [[R]]
 ;
   %c0 = icmp eq <2 x i64> %x, %y
   %c1 = icmp eq <2 x i64> %z, %w
@@ -46,10 +86,10 @@ define <2 x i64> @shuf_icmp_eq_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z, <
 
 define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
 ; CHECK-LABEL: define <4 x i32> @shuf_icmp_ugt_v4i32(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[C0:%.*]] = icmp ugt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT:    [[C1:%.*]] = icmp ugt <4 x i32> [[Z]], [[W]]
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[C0]], <4 x i1> [[C1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[S:%.*]] = icmp ugt <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
@@ -60,16 +100,32 @@ define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z,
   ret <4 x i32> %r
 }
 
-; Common operand is op0 of the fcmps.
+; Common operand is op0 of the fcmps (CMPPS cheaper on SSE4+).
 
 define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
-; CHECK-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; SSE2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE2-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; SSE4-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; SSE4-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE4-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; SSE4-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE4-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE4-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; AVX-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; AVX-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp oeq <4 x float> %x, %y
   %b1 = fcmp oeq <4 x float> %x, %z
@@ -81,13 +137,29 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
 ; For commutative instructions, common operand may be swapped
 
 define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
-; CHECK-NEXT:    [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; SSE-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT:    [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
+; AVX2-NEXT:    [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
+; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; AVX512-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp one <4 x float> %x, %y
   %b1 = fcmp one <4 x float> %z, %x
@@ -99,13 +171,29 @@ define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x
 ; non-commutative pred, but common op0
 
 define <4 x i32> @shuf_icmp_sgt_v4i32_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
-; CHECK-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
-; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT:    [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
-; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
+; SSE-NEXT:    [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
+; SSE-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT:    [[B0:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
+; AVX2-NEXT:    [[B1:%.*]] = icmp sgt <4 x i32> [[X]], [[Z]]
+; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_icmp_sgt_v4i32_swap(
+; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 2>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; AVX512-NEXT:    [[S:%.*]] = icmp sgt <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = icmp sgt <4 x i32> %x, %y
   %b1 = icmp sgt <4 x i32> %x, %z

alexey-bataev · 2024-12-16T14:34:22Z

llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll

+; SSE2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]], <2 x i64> [[W:%.*]]) #[[ATTR0]] {
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> [[W]], <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[S:%.*]] = icmp eq <2 x i64> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[R:%.*]] = sext <2 x i1> [[S]] to <2 x i64>
+; SSE2-NEXT:    ret <2 x i64> [[R]]


Is this really cheaper?

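Pre-SSE4 - yes. A v2i64 icmp eq is only cheap on SSE4+ targets with PCMPEQQ, so on SSE2 replacing two compares with two shuffles and a single compare is cheaper: https://zig.godbolt.org/z/nxEnGMrcn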

alexey-bataev

LG

llvm-ci · 2024-12-16T17:42:06Z

LLVM Buildbot has detected a new failure on builder openmp-offload-libc-amdgpu-runtime running on omp-vega20-1 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/10417

Here is the relevant piece of the build log for the reference

Step 7 (Add check check-offload) failure: test (failure)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: sanitizer/ptr_outside_alloc_2.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp    -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa -O3 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/ptr_outside_alloc_2.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/ptr_outside_alloc_2.c.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa -O3 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/ptr_outside_alloc_2.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/ptr_outside_alloc_2.c.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# RUN: at line 3
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/ptr_outside_alloc_2.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/ptr_outside_alloc_2.c --check-prefixes=CHECK
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/ptr_outside_alloc_2.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/ptr_outside_alloc_2.c --check-prefixes=CHECK
# .---command stderr------------
# | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/ptr_outside_alloc_2.c:21:11: error: CHECK: expected string not found in input
# | // CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
# |           ^
# | <stdin>:1:1: note: scanning from here
# | AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_OUT_OF_RESOURCES: The runtime failed to allocate the necessary resources. This error may also occur when the core runtime library needs to spawn threads or create internal OS-specific events.
# | ^
# | 
# | Input file: <stdin>
# | Check file: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/ptr_outside_alloc_2.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |           1: AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_OUT_OF_RESOURCES: The runtime failed to allocate the necessary resources. This error may also occur when the core runtime library needs to spawn threads or create internal OS-specific events. 
# | check:21     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |           2: AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_OUT_OF_RESOURCES: The runtime failed to allocate the necessary resources. This error may also occur when the core runtime library needs to spawn threads or create internal OS-specific events. 
# | check:21     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           3: "PluginInterface" error: Failure to allocate device memory: Failed to allocate from memory manager 
# | check:21     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           4: omptarget error: Call to getTargetPointer returned null pointer (device failure or illegal mapping). 
# | check:21     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           5: omptarget error: Call to targetDataBegin failed, abort target. 
# | check:21     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           6: omptarget error: Failed to process data before launching the kernel. 
# | check:21     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           .
# |           .
# |           .
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--

********************

[VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well

2be7c2b

Extend binary instructions matching to match compare instructions + predicate as well.

RKSimon requested review from alexey-bataev and davemgreen December 16, 2024 13:05

llvmbot added vectorizers llvm:transforms labels Dec 16, 2024

alexey-bataev reviewed Dec 16, 2024

View reviewed changes

alexey-bataev approved these changes Dec 16, 2024

View reviewed changes

RKSimon merged commit 8217c2e into llvm:main Dec 16, 2024
11 checks passed

RKSimon deleted the vectorcombine-shuffle-of-cmpops branch December 16, 2024 17:23

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well #120075

[VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well #120075

Uh oh!

RKSimon commented Dec 16, 2024

Uh oh!

llvmbot commented Dec 16, 2024 •

edited

Loading

Uh oh!

alexey-bataev Dec 16, 2024

Uh oh!

RKSimon Dec 16, 2024

Uh oh!

alexey-bataev left a comment

Uh oh!

Uh oh!

llvm-ci commented Dec 16, 2024

Uh oh!

Uh oh!

[VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well #120075

[VectorCombine] foldShuffleOfBinops - extend to handle icmp/fcmp ops as well #120075

Uh oh!

Conversation

RKSimon commented Dec 16, 2024

Uh oh!

llvmbot commented Dec 16, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

alexey-bataev Dec 16, 2024

Choose a reason for hiding this comment

Uh oh!

RKSimon Dec 16, 2024

Choose a reason for hiding this comment

Uh oh!

alexey-bataev left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Dec 16, 2024

Uh oh!

Uh oh!

llvmbot commented Dec 16, 2024 •

edited

Loading