[VectorCombine] foldShuffleOfBinops - add support for length changing shuffles

RKSimon · RKSimon · commit eb77e38a751b · 2024-04-22T11:22:52.000+01:00
Refactor to be closer to foldShuffleOfCastops
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1394,60 +1394,91 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   return true;
 }
 
-/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
-/// "binop (shuffle), (shuffle)".
+/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
 bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
-  auto *VecTy = cast<FixedVectorType>(I.getType());
   BinaryOperator *B0, *B1;
-  ArrayRef<int> Mask;
+  ArrayRef<int> OldMask;
   if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
-                           m_Mask(Mask))) ||
-      B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
+                           m_Mask(OldMask))))
     return false;
 
   // Don't introduce poison into div/rem.
-  if (any_of(Mask, [](int M) { return M == PoisonMaskElem; }) &&
+  if (any_of(OldMask, [](int M) { return M == PoisonMaskElem; }) &&
       B0->isIntDivRem())
     return false;
 
-  // Try to replace a binop with a shuffle if the shuffle is not costly.
-  // The new shuffle will choose from a single, common operand, so it may be
-  // cheaper than the existing two-operand shuffle.
-  SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
+  // TODO: Add support for addlike etc.
   Instruction::BinaryOps Opcode = B0->getOpcode();
-  InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
-  InstructionCost ShufCost = TTI.getShuffleCost(
-      TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
-  if (ShufCost > BinopCost)
+  if (Opcode != B1->getOpcode())
+    return false;
+
+  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
+  auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType());
+  if (!ShuffleDstTy || !BinOpTy)
     return false;
 
+  unsigned NumSrcElts = BinOpTy->getNumElements();
+
   // If we have something like "add X, Y" and "add Z, X", swap ops to match.
   Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
   Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
   if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
+    if (X == W || Y == Z)
     std::swap(X, Y);
 
-  Value *Shuf0, *Shuf1;
+  auto ConvertToUnary = [NumSrcElts](int &M) {
+    if (M >= (int)NumSrcElts)
+      M -= NumSrcElts;
+  };
+
+  SmallVector<int> NewMask0(OldMask.begin(), OldMask.end());
+  TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
   if (X == Z) {
-    // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
-    Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
-    Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
-  } else if (Y == W) {
-    // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
-    Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
-    Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
-  } else {
-    return false;
+    llvm::for_each(NewMask0, ConvertToUnary);
+    SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
+    Z = PoisonValue::get(BinOpTy);
+  }
+
+  SmallVector<int> NewMask1(OldMask.begin(), OldMask.end());
+  TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
+  if (Y == W) {
+    llvm::for_each(NewMask1, ConvertToUnary);
+    SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
+    W = PoisonValue::get(BinOpTy);
   }
 
+  // Try to replace a binop with a shuffle if the shuffle is not costly.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  InstructionCost OldCost =
+      TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy) +
+      TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
+                         OldMask, CostKind, 0, nullptr, std::nullopt, &I);
+
+  InstructionCost NewCost =
+      TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind) +
+      TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind) +
+      TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy);
+
+  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
+                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+  if (NewCost > OldCost)
+    return false;
+
+  Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
+  Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
   Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+
   // Intersect flags from the old binops.
   if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
     NewInst->copyIRFlags(B0);
     NewInst->andIRFlags(B1);
   }
 
-  // TODO: Add Shuf0/Shuf1 to WorkList?
+  Worklist.pushValue(Shuf0);
+  Worklist.pushValue(Shuf1);
   replaceValue(I, *NewBO);
   return true;
 }
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -553,9 +553,8 @@ define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
 ; CHECK-NEXT:    [[STRIDED_VEC31:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 5, i32 13>
 ; CHECK-NEXT:    [[STRIDED_VEC32:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 6, i32 14>
 ; CHECK-NEXT:    [[STRIDED_VEC33:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 7, i32 15>
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
-; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
+; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC35:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 0, i32 8>
 ; CHECK-NEXT:    [[STRIDED_VEC36:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 1, i32 9>
 ; CHECK-NEXT:    [[STRIDED_VEC37:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 2, i32 10>
@@ -564,32 +563,33 @@ define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
 ; CHECK-NEXT:    [[STRIDED_VEC40:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 5, i32 13>
 ; CHECK-NEXT:    [[STRIDED_VEC41:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 6, i32 14>
 ; CHECK-NEXT:    [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 7, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[STRIDED_VEC35]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[STRIDED_VEC27]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[STRIDED_VEC36]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <2 x double> [[STRIDED_VEC28]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[STRIDED_VEC37]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <2 x double> [[STRIDED_VEC29]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[STRIDED_VEC38]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast <2 x double> [[STRIDED_VEC30]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast <2 x double> [[STRIDED_VEC39]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <2 x double> [[STRIDED_VEC31]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[STRIDED_VEC40]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[STRIDED_VEC32]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fadd fast <2 x double> [[STRIDED_VEC41]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = or disjoint i64 [[TMP0]], 7
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <2 x double> [[STRIDED_VEC33]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd fast <2 x double> [[STRIDED_VEC42]], [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i64 -56
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> [[TMP25]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP27]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    store <16 x double> [[INTERLEAVED_VEC]], ptr [[TMP21]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 7
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -56
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[STRIDED_VEC35]], <2 x double> [[STRIDED_VEC36]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[STRIDED_VEC]], <2 x double> [[STRIDED_VEC27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[STRIDED_VEC37]], <2 x double> [[STRIDED_VEC38]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[STRIDED_VEC28]], <2 x double> [[STRIDED_VEC29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[STRIDED_VEC39]], <2 x double> [[STRIDED_VEC40]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[STRIDED_VEC30]], <2 x double> [[STRIDED_VEC31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[STRIDED_VEC41]], <2 x double> [[STRIDED_VEC42]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[STRIDED_VEC32]], <2 x double> [[STRIDED_VEC33]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <8 x double> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd fast <8 x double> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <8 x double> [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast <8 x double> [[TMP23]], [[TMP26]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP22]], <8 x double> [[TMP27]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    store <16 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
@@ -84,14 +84,14 @@ define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
   ret <4 x i32> %r
 }
 
-; negative test - common operand, but not commutable
+; common operand, but not commutable (expensive vector shift)
 
 define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap(
 ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT:    [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]]
-; CHECK-NEXT:    [[R1:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
+; CHECK-NEXT:    [[R1:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x i32> [[R1]]
 ;
   %b0 = shl <4 x i32> %x, %y
@@ -116,15 +116,22 @@ define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z
   ret <2 x i64> %r
 }
 
-; negative test - type change via shuffle
+; widen vector (SSE - cheaper fmul vs AVX - cheaper shuffle)
 
 define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]]
-; CHECK-NEXT:    [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
-; CHECK-NEXT:    ret <8 x float> [[R]]
+; SSE-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]]
+; SSE-NEXT:    [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]]
+; SSE-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
+; SSE-NEXT:    ret <8 x float> [[R]]
+;
+; AVX-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
+; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 0, i32 1, i32 1, i32 2>
+; AVX-NEXT:    [[R:%.*]] = fmul <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x float> [[R]]
 ;
   %b0 = fmul <4 x float> %x, %y
   %b1 = fmul <4 x float> %z, %x
@@ -168,15 +175,22 @@ define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
   ret <4 x i32> %r
 }
 
-; negative test - must have matching operand
+; must have matching operand (SSE - cheaper shuffle vs AVX - cheaper fadd)
 
 define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
-; CHECK-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[B0:%.*]] = fadd <4 x float> [[X]], [[Y]]
-; CHECK-NEXT:    [[B1:%.*]] = fadd <4 x float> [[Z]], [[W]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; SSE-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[R:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <4 x float> [[R]]
+;
+; AVX-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op(
+; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[B0:%.*]] = fadd <4 x float> [[X]], [[Y]]
+; AVX-NEXT:    [[B1:%.*]] = fadd <4 x float> [[Z]], [[W]]
+; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX-NEXT:    ret <4 x float> [[R]]
 ;
   %b0 = fadd <4 x float> %x, %y
   %b1 = fadd <4 x float> %z, %w
@@ -216,6 +230,3 @@ define <4 x i32> @shuf_srem_v4i32_poison(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %r
 }
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}