Commit f819e4c

[X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats
Given a shuffle mask, if it is picking from an input that is a splat at the current granularity of the shuffle, adjust the mask to pick from the same lane of the input that the mask element is in. This may allow the shuffle to be simplified into a blend.

I believe this is correct, given that the splat detection matches the one just above the new code. My basic thought is that we might be able to get fewer regressions by handling multiple insertions of the same value into a vector if we form broadcasts+blend here, as opposed to D105390, but I have not really thought this through, and have not tried implementing it yet.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D107009
1 parent 438f700 commit f819e4c
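
For illustration only, here is a minimal standalone sketch of the mask canonicalization described above, written against plain STL containers rather than LLVM's SmallVector/ArrayRef. The helper name canonicalizeSplatPicks is made up for this example, and it simplifies the committed logic by assuming both inputs have the same number of elements as the root shuffle (the actual patch tracks per-input widths):

#include <cassert>
#include <cstdio>
#include <vector>

// Rewrite every mask element that reads from a splat input so that it reads
// from the lane it occupies. A splat holds the same value in every lane, so
// the rewrite is behaviour-preserving, and the identity-lane form can lower
// as a per-lane blend of the two inputs.
static std::vector<int> canonicalizeSplatPicks(std::vector<int> Mask,
                                               unsigned NumElts,
                                               bool Input0IsSplat,
                                               bool Input1IsSplat) {
  assert(Mask.size() == NumElts && "Mask/width mismatch");
  if (!Input0IsSplat && !Input1IsSplat)
    return Mask; // No splat operand - nothing to canonicalize.
  for (unsigned I = 0; I != NumElts; ++I) {
    int &M = Mask[I];
    if (M < 0)
      continue; // Keep undef/zero sentinels as-is.
    unsigned InputIdx = (unsigned)M < NumElts ? 0 : 1;
    bool IsSplat = InputIdx == 0 ? Input0IsSplat : Input1IsSplat;
    if (IsSplat)
      M = (int)(I + InputIdx * NumElts); // Same lane, same input.
  }
  return Mask;
}

int main() {
  // The pattern from the avx.ll test below: lane 3 reads lane 0 of a
  // broadcast second operand. Canonicalizing {0,1,2,4} yields {0,1,2,7},
  // which is a blend (lanes 0-2 from input 0, lane 3 from input 1).
  std::vector<int> NewMask = canonicalizeSplatPicks(
      {0, 1, 2, 4}, /*NumElts=*/4,
      /*Input0IsSplat=*/false, /*Input1IsSplat=*/true);
  for (int M : NewMask)
    std::printf("%d ", M); // Prints: 0 1 2 7
  std::printf("\n");
  return 0;
}

This is why the insertps-from-broadcast tests below switch from vinsertps (extract lane 0 of the broadcast) to vblendps (take lane 3 of the broadcast in place).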

6 files changed: +92 additions, −70 deletions


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 41 additions & 2 deletions
@@ -35797,6 +35797,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       (RootVT.isFloatingPoint() && Depth >= 1) ||
       (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
+  // How many elements does each of the inputs have, given the current
+  // granularity of the root shuffle? Note that while currently the sizes of an
+  // inputs must match the size of the shuffle root,
+  // that restriction will be lifted in the future.
+  SmallVector<unsigned, 2> InputNumElts;
+  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
+                  std::back_inserter(InputNumElts),
+                  [BaseMaskEltSizeInBits](MVT VT) {
+                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
+                           "Input is not a multiple of output element width?");
+                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
+                  });
+
   // Don't combine if we are a AVX512/EVEX target and the mask element size
   // is different from the root element size - this would prevent writemasks
   // from being reused.
@@ -35811,12 +35824,38 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // If we are shuffling a broadcast (and not introducing zeros) then
   // we can just use the broadcast directly. This works for smaller broadcast
   // elements as well as they already repeat across each mask element
-  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
-      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+  SmallVector<bool, 2> InputIsSplat;
+  llvm::transform(
+      std::initializer_list<SDValue>({V1, V2}),
+      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
+        return isTargetShuffleSplat(V) &&
+               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
+      });
+  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
       V1.getValueSizeInBits() >= RootSizeInBits) {
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
+  // Adjust mask elements that pick from a splat input to be identity mask elts,
+  // i.e. to pick from the same lane of the input as the mask element is in.
+  // This may allow to simplify the shuffle into a blend.
+  SmallVector<int> NewMask;
+  if (InputIsSplat[0] || InputIsSplat[1]) {
+    NewMask.assign(BaseMask.begin(), BaseMask.end());
+    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
+      int &M = NewMask[i];
+      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
+             "OOB mask element?");
+      if (M < 0)
+        continue; // Keep the undef/zero mask elements as-is.
+      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
+      // Is the used input wide-enough to contain that lane, and is it a splat?
+      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
+        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
+    }
+    BaseMask = std::move(NewMask);
+  }
+
   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
   // etc. can be simplified.
   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {

llvm/test/CodeGen/X86/avx.ll

Lines changed: 8 additions & 8 deletions
@@ -153,23 +153,23 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X32-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    retq

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 2 additions & 2 deletions
@@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,5,2,7]
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
@@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,5,2,7]
 ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}

llvm/test/CodeGen/X86/pr15296.ll

Lines changed: 5 additions & 22 deletions
@@ -26,28 +26,11 @@ allocas:
 define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
 ; CHECK-LABEL: shiftInput___canonical:
 ; CHECK:       # %bb.0: # %allocas
-; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vpsrld %xmm2, %xmm3, %xmm4
-; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm5
-; CHECK-NEXT:    vpsrld %xmm5, %xmm3, %xmm6
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
-; CHECK-NEXT:    vpsrld %xmm6, %xmm3, %xmm7
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; CHECK-NEXT:    vpsrld %xmm1, %xmm3, %xmm3
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
-; CHECK-NEXT:    vpsrld %xmm5, %xmm0, %xmm4
-; CHECK-NEXT:    vpsrld %xmm6, %xmm0, %xmm5
-; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vpsrld %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 allocas:
   %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0

llvm/test/CodeGen/X86/sse41.ll

Lines changed: 34 additions & 34 deletions
@@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
@@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X86-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -1712,32 +1712,32 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X64-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X64-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
 ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X64-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
 ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = getelementptr inbounds float, float* %fb, i64 %index

llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

Lines changed: 2 additions & 2 deletions
@@ -4591,14 +4591,14 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX512VLBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
