Skip to content

Commit eb77e38

Browse files
committed
[VectorCombine] foldShuffleOfBinops - add support for length changing shuffles
Refactor to be closer to foldShuffleOfCastops
1 parent ca9a44e commit eb77e38

File tree

3 files changed, with 118 additions and 76 deletions.

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 57 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1394,60 +1394,91 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
13941394
return true;
13951395
}
13961396

1397-
/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
1398-
/// "binop (shuffle), (shuffle)".
1397+
/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
13991398
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
1400-
auto *VecTy = cast<FixedVectorType>(I.getType());
14011399
BinaryOperator *B0, *B1;
1402-
ArrayRef<int> Mask;
1400+
ArrayRef<int> OldMask;
14031401
if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
1404-
m_Mask(Mask))) ||
1405-
B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
1402+
m_Mask(OldMask))))
14061403
return false;
14071404

14081405
// Don't introduce poison into div/rem.
1409-
if (any_of(Mask, [](int M) { return M == PoisonMaskElem; }) &&
1406+
if (any_of(OldMask, [](int M) { return M == PoisonMaskElem; }) &&
14101407
B0->isIntDivRem())
14111408
return false;
14121409

1413-
// Try to replace a binop with a shuffle if the shuffle is not costly.
1414-
// The new shuffle will choose from a single, common operand, so it may be
1415-
// cheaper than the existing two-operand shuffle.
1416-
SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
1410+
// TODO: Add support for addlike etc.
14171411
Instruction::BinaryOps Opcode = B0->getOpcode();
1418-
InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
1419-
InstructionCost ShufCost = TTI.getShuffleCost(
1420-
TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
1421-
if (ShufCost > BinopCost)
1412+
if (Opcode != B1->getOpcode())
1413+
return false;
1414+
1415+
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
1416+
auto *BinOpTy = dyn_cast<FixedVectorType>(B0->getType());
1417+
if (!ShuffleDstTy || !BinOpTy)
14221418
return false;
14231419

1420+
unsigned NumSrcElts = BinOpTy->getNumElements();
1421+
14241422
// If we have something like "add X, Y" and "add Z, X", swap ops to match.
14251423
Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
14261424
Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
14271425
if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
1426+
if (X == W || Y == Z)
14281427
std::swap(X, Y);
14291428

1430-
Value *Shuf0, *Shuf1;
1429+
auto ConvertToUnary = [NumSrcElts](int &M) {
1430+
if (M >= (int)NumSrcElts)
1431+
M -= NumSrcElts;
1432+
};
1433+
1434+
SmallVector<int> NewMask0(OldMask.begin(), OldMask.end());
1435+
TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
14311436
if (X == Z) {
1432-
// shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
1433-
Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
1434-
Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
1435-
} else if (Y == W) {
1436-
// shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
1437-
Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
1438-
Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
1439-
} else {
1440-
return false;
1437+
llvm::for_each(NewMask0, ConvertToUnary);
1438+
SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
1439+
Z = PoisonValue::get(BinOpTy);
1440+
}
1441+
1442+
SmallVector<int> NewMask1(OldMask.begin(), OldMask.end());
1443+
TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
1444+
if (Y == W) {
1445+
llvm::for_each(NewMask1, ConvertToUnary);
1446+
SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
1447+
W = PoisonValue::get(BinOpTy);
14411448
}
14421449

1450+
// Try to replace a binop with a shuffle if the shuffle is not costly.
1451+
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1452+
1453+
InstructionCost OldCost =
1454+
TTI.getArithmeticInstrCost(B0->getOpcode(), BinOpTy) +
1455+
TTI.getArithmeticInstrCost(B1->getOpcode(), BinOpTy) +
1456+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy,
1457+
OldMask, CostKind, 0, nullptr, std::nullopt, &I);
1458+
1459+
InstructionCost NewCost =
1460+
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind) +
1461+
TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind) +
1462+
TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy);
1463+
1464+
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
1465+
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1466+
<< "\n");
1467+
if (NewCost > OldCost)
1468+
return false;
1469+
1470+
Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
1471+
Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
14431472
Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
1473+
14441474
// Intersect flags from the old binops.
14451475
if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
14461476
NewInst->copyIRFlags(B0);
14471477
NewInst->andIRFlags(B1);
14481478
}
14491479

1450-
// TODO: Add Shuf0/Shuf1 to WorkList?
1480+
Worklist.pushValue(Shuf0);
1481+
Worklist.pushValue(Shuf1);
14511482
replaceValue(I, *NewBO);
14521483
return true;
14531484
}

llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll

Lines changed: 29 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -553,9 +553,8 @@ define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
553553
; CHECK-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 5, i32 13>
554554
; CHECK-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 6, i32 14>
555555
; CHECK-NEXT: [[STRIDED_VEC33:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 7, i32 15>
556-
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
557-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
558-
; CHECK-NEXT: [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP3]], align 8
556+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
557+
; CHECK-NEXT: [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP2]], align 8
559558
; CHECK-NEXT: [[STRIDED_VEC35:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 0, i32 8>
560559
; CHECK-NEXT: [[STRIDED_VEC36:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 1, i32 9>
561560
; CHECK-NEXT: [[STRIDED_VEC37:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 2, i32 10>
@@ -564,32 +563,33 @@ define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
564563
; CHECK-NEXT: [[STRIDED_VEC40:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 5, i32 13>
565564
; CHECK-NEXT: [[STRIDED_VEC41:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 6, i32 14>
566565
; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 7, i32 15>
567-
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[STRIDED_VEC35]], [[TMP2]]
568-
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[STRIDED_VEC27]], [[BROADCAST_SPLAT]]
569-
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[STRIDED_VEC36]], [[TMP5]]
570-
; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[STRIDED_VEC28]], [[BROADCAST_SPLAT]]
571-
; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[STRIDED_VEC37]], [[TMP7]]
572-
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[STRIDED_VEC29]], [[BROADCAST_SPLAT]]
573-
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[STRIDED_VEC38]], [[TMP9]]
574-
; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x double> [[STRIDED_VEC30]], [[BROADCAST_SPLAT]]
575-
; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[STRIDED_VEC39]], [[TMP11]]
576-
; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x double> [[STRIDED_VEC31]], [[BROADCAST_SPLAT]]
577-
; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[STRIDED_VEC40]], [[TMP13]]
578-
; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[STRIDED_VEC32]], [[BROADCAST_SPLAT]]
579-
; CHECK-NEXT: [[TMP16:%.*]] = fadd fast <2 x double> [[STRIDED_VEC41]], [[TMP15]]
580-
; CHECK-NEXT: [[TMP17:%.*]] = or disjoint i64 [[TMP0]], 7
581-
; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <2 x double> [[STRIDED_VEC33]], [[BROADCAST_SPLAT]]
582-
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP17]]
583-
; CHECK-NEXT: [[TMP20:%.*]] = fadd fast <2 x double> [[STRIDED_VEC42]], [[TMP18]]
584-
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i64 -56
585-
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
586-
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
587-
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
588-
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
589-
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
590-
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> [[TMP25]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
591-
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP27]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
592-
; CHECK-NEXT: store <16 x double> [[INTERLEAVED_VEC]], ptr [[TMP21]], align 8
566+
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 7
567+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP3]]
568+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -56
569+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[STRIDED_VEC35]], <2 x double> [[STRIDED_VEC36]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
570+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[STRIDED_VEC]], <2 x double> [[STRIDED_VEC27]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
571+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
572+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[STRIDED_VEC37]], <2 x double> [[STRIDED_VEC38]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
573+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[STRIDED_VEC28]], <2 x double> [[STRIDED_VEC29]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
574+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
575+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[STRIDED_VEC39]], <2 x double> [[STRIDED_VEC40]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
576+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[STRIDED_VEC30]], <2 x double> [[STRIDED_VEC31]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
577+
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
578+
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[STRIDED_VEC41]], <2 x double> [[STRIDED_VEC42]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
579+
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[STRIDED_VEC32]], <2 x double> [[STRIDED_VEC33]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
580+
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
581+
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
582+
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
583+
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
584+
; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <8 x double> [[TMP19]], [[TMP20]]
585+
; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <8 x double> [[TMP18]], [[TMP21]]
586+
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
587+
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
588+
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
589+
; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <8 x double> [[TMP24]], [[TMP25]]
590+
; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <8 x double> [[TMP23]], [[TMP26]]
591+
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP22]], <8 x double> [[TMP27]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
592+
; CHECK-NEXT: store <16 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
593593
; CHECK-NEXT: ret void
594594
;
595595
entry:

llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll

Lines changed: 32 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -84,14 +84,14 @@ define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
8484
ret <4 x i32> %r
8585
}
8686

87-
; negative test - common operand, but not commutable
87+
; common operand, but not commutable (expensive vector shift)
8888

8989
define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
9090
; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap(
9191
; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
92-
; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]]
93-
; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]]
94-
; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
92+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Z]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
93+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <4 x i32> <i32 3, i32 2, i32 2, i32 5>
94+
; CHECK-NEXT: [[R1:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
9595
; CHECK-NEXT: ret <4 x i32> [[R1]]
9696
;
9797
%b0 = shl <4 x i32> %x, %y
@@ -116,15 +116,22 @@ define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z
116116
ret <2 x i64> %r
117117
}
118118

119-
; negative test - type change via shuffle
119+
; widen vector (SSE - cheaper fmul vs AVX - cheaper shuffle)
120120

121121
define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
122-
; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
123-
; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
124-
; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]]
125-
; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]]
126-
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
127-
; CHECK-NEXT: ret <8 x float> [[R]]
122+
; SSE-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
123+
; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
124+
; SSE-NEXT: [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]]
125+
; SSE-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]]
126+
; SSE-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
127+
; SSE-NEXT: ret <8 x float> [[R]]
128+
;
129+
; AVX-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
130+
; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
131+
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
132+
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 0, i32 1, i32 1, i32 2>
133+
; AVX-NEXT: [[R:%.*]] = fmul <8 x float> [[TMP1]], [[TMP2]]
134+
; AVX-NEXT: ret <8 x float> [[R]]
128135
;
129136
%b0 = fmul <4 x float> %x, %y
130137
%b1 = fmul <4 x float> %z, %x
@@ -168,15 +175,22 @@ define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
168175
ret <4 x i32> %r
169176
}
170177

171-
; negative test - must have matching operand
178+
; must have matching operand (SSE - cheaper shuffle vs AVX - cheaper fadd)
172179

173180
define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
174-
; CHECK-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op(
175-
; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] {
176-
; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X]], [[Y]]
177-
; CHECK-NEXT: [[B1:%.*]] = fadd <4 x float> [[Z]], [[W]]
178-
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
179-
; CHECK-NEXT: ret <4 x float> [[R]]
181+
; SSE-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op(
182+
; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] {
183+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Z]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
184+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[W]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
185+
; SSE-NEXT: [[R:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
186+
; SSE-NEXT: ret <4 x float> [[R]]
187+
;
188+
; AVX-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op(
189+
; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] {
190+
; AVX-NEXT: [[B0:%.*]] = fadd <4 x float> [[X]], [[Y]]
191+
; AVX-NEXT: [[B1:%.*]] = fadd <4 x float> [[Z]], [[W]]
192+
; AVX-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
193+
; AVX-NEXT: ret <4 x float> [[R]]
180194
;
181195
%b0 = fadd <4 x float> %x, %y
182196
%b1 = fadd <4 x float> %z, %w
@@ -216,6 +230,3 @@ define <4 x i32> @shuf_srem_v4i32_poison(<4 x i32> %a0, <4 x i32> %a1) {
216230
ret <4 x i32> %r
217231
}
218232

219-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
220-
; AVX: {{.*}}
221-
; SSE: {{.*}}

0 commit comments

Comments
 (0)