Commit bd17ced

Revert "[X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats"
This reverts commits f819e4c and 35c0848.

It triggers an infinite loop during compilation.

$ cat t.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @MaxPoolGradGrad_1.65() local_unnamed_addr #0 {
entry:
  %wide.vec78 = load <64 x i32>, <64 x i32>* null, align 16
  %strided.vec83 = shufflevector <64 x i32> %wide.vec78, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  %0 = lshr <8 x i32> %strided.vec83, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = add <8 x i32> zeroinitializer, %0
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = shufflevector <16 x i32> %2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %interleaved.vec = shufflevector <32 x i32> undef, <32 x i32> %3, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <64 x i32> %interleaved.vec, <64 x i32>* undef, align 16
  unreachable
}

$ llc < t.ll -mcpu=skylake
<hang>
1 parent f3f3098 commit bd17ced

6 files changed: +98 −117 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 30 additions & 66 deletions
@@ -35796,19 +35796,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                      (RootVT.isFloatingPoint() && Depth >= 1) ||
                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
-  // How many elements does each of the inputs have, given the current
-  // granularity of the root shuffle? Note that while currently the sizes of an
-  // inputs must match the size of the shuffle root,
-  // that restriction will be lifted in the future.
-  SmallVector<unsigned, 2> InputNumElts;
-  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
-                  std::back_inserter(InputNumElts),
-                  [BaseMaskEltSizeInBits](MVT VT) {
-                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
-                           "Input is not a multiple of output element width?");
-                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
-                  });
-
   // Don't combine if we are a AVX512/EVEX target and the mask element size
   // is different from the root element size - this would prevent writemasks
   // from being reused.
@@ -35823,44 +35810,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // If we are shuffling a broadcast (and not introducing zeros) then
   // we can just use the broadcast directly. This works for smaller broadcast
   // elements as well as they already repeat across each mask element
-  SmallVector<bool, 2> InputIsSplat;
-  llvm::transform(
-      std::initializer_list<SDValue>({V1, V2}),
-      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
-        return isTargetShuffleSplat(V) &&
-               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
-      });
-  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
+  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
       V1.getValueSizeInBits() >= RootSizeInBits) {
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
-  SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
-
-  // Adjust mask elements that pick from a splat input to be identity mask elts,
-  // i.e. to pick from the same lane of the input as the mask element is in.
-  // This may allow to simplify the shuffle into a blend.
-  if (InputIsSplat[0] || InputIsSplat[1]) {
-    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
-      int &M = Mask[i];
-      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
-             "OOB mask element?");
-      if (M < 0)
-        continue; // Keep the undef/zero mask elements as-is.
-      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
-      // Is the used input wide-enough to contain that lane, and is it a splat?
-      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
-        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
-    }
-  }
-
   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
   // etc. can be simplified.
   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
     SmallVector<int> ScaledMask, IdentityMask;
     unsigned NumElts = VT1.getVectorNumElements();
-    if (Mask.size() <= NumElts &&
-        scaleShuffleElements(Mask, NumElts, ScaledMask)) {
+    if (BaseMask.size() <= NumElts &&
+        scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
       for (unsigned i = 0; i != NumElts; ++i)
         IdentityMask.push_back(i);
       if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
@@ -35874,22 +35836,22 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // If the upper subvectors are zeroable, then an extract+insert is more
     // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
     // to zero the upper subvectors.
-    if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
+    if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
-      assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
+      assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
              "Unexpected lane shuffle");
       Res = CanonicalizeShuffleInput(RootVT, V1);
-      unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
-      bool UseZero = isAnyZero(Mask);
+      unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
+      bool UseZero = isAnyZero(BaseMask);
       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
       return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
     }
 
     // Narrow shuffle mask to v4x128.
     SmallVector<int, 4> ScaledMask;
     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
-    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
+    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, ScaledMask);
 
     // Try to lower to vshuf64x2/vshuf32x4.
     auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
@@ -35948,20 +35910,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // If the upper half is zeroable, then an extract+insert is more optimal
     // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
     // zero the upper half.
-    if (isUndefOrZero(Mask[1])) {
+    if (isUndefOrZero(BaseMask[1])) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
-      assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
+      assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
       Res = CanonicalizeShuffleInput(RootVT, V1);
-      Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
-      return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
-                            256);
+      Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
+      return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+                            DL, 256);
     }
 
     // If we're splatting the low subvector, an insert-subvector 'concat'
     // pattern is quicker than VPERM2X128.
     // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
-    if (Mask[0] == 0 && Mask[1] == 0 && !Subtarget.hasAVX2()) {
+    if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
       Res = CanonicalizeShuffleInput(RootVT, V1);
@@ -35976,11 +35938,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // we need to use the zeroing feature.
     // Prefer blends for sequential shuffles unless we are optimizing for size.
     if (UnaryShuffle &&
-        !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
-        (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
+        !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+        (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
       unsigned PermMask = 0;
-      PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
-      PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
+      PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+      PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
       return DAG.getNode(
           X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
           DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
@@ -35991,15 +35953,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
     // TODO - handle AVX512VL cases with X86ISD::SHUF128.
     if (!UnaryShuffle && !IsMaskedShuffle) {
-      assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
+      assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
              "Unexpected shuffle sentinel value");
       // Prefer blends to X86ISD::VPERM2X128.
-      if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
+      if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+            (BaseMask[0] == 2 && BaseMask[1] == 1))) {
         unsigned PermMask = 0;
-        PermMask |= ((Mask[0] & 3) << 0);
-        PermMask |= ((Mask[1] & 3) << 4);
-        SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
-        SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
+        PermMask |= ((BaseMask[0] & 3) << 0);
+        PermMask |= ((BaseMask[1] & 3) << 4);
+        SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
+        SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
                            CanonicalizeShuffleInput(RootVT, LHS),
                            CanonicalizeShuffleInput(RootVT, RHS),
@@ -36010,12 +35973,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
   // For masks that have been widened to 128-bit elements or more,
   // narrow back down to 64-bit elements.
+  SmallVector<int, 64> Mask;
   if (BaseMaskEltSizeInBits > 64) {
     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
     int MaskScale = BaseMaskEltSizeInBits / 64;
-    SmallVector<int, 64> ScaledMask;
-    narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
-    Mask = std::move(ScaledMask);
+    narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
+  } else {
+    Mask.assign(BaseMask.begin(), BaseMask.end());
   }
 
   // For masked shuffles, we're trying to match the root width for better
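
For orientation (an editorial illustration, not part of the commit): the deleted block above rewrote any mask element that reads from a splat input so that it reads the lane it writes, since every lane of a splat holds the same value. Below is a minimal standalone sketch of that rewrite over a plain std::vector; the LLVM helpers and the check that the input is wide enough for the lane are omitted, and canonicalizeSplatMask is a made-up name.

#include <cassert>
#include <vector>

// Rough model of the reverted canonicalization: a mask element that reads
// from a splat input may instead read the lane it writes, because all lanes
// of a splat are equal. Negative elements are undef/zero sentinels.
static void canonicalizeSplatMask(std::vector<int> &Mask, bool In0IsSplat,
                                  bool In1IsSplat) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int I = 0; I != NumElts; ++I) {
    int &M = Mask[I];
    if (M < 0)
      continue; // keep undef/zero mask elements as-is
    const int InputIdx = M < NumElts ? 0 : 1;
    if (InputIdx == 0 ? In0IsSplat : In1IsSplat)
      M = I + InputIdx * NumElts; // pick the same lane of that input
  }
}

int main() {
  // Mirrors the partial_permute.ll delta further down: with input 1 a
  // broadcast, [3,4,2,6] canonicalized to [3,5,2,7] before this revert.
  std::vector<int> Mask = {3, 4, 2, 6};
  canonicalizeSplatMask(Mask, /*In0IsSplat=*/false, /*In1IsSplat=*/true);
  assert(Mask == (std::vector<int>{3, 5, 2, 7}));
  return 0;
}

The revert removes that rewrite and threads BaseMask straight through the lane-shuffle matching; the final narrowing to 64-bit mask elements now copies BaseMask into Mask instead.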

llvm/test/CodeGen/X86/avx.ll

Lines changed: 8 additions & 8 deletions
@@ -153,23 +153,23 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
-; X32-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
+; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
 ; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
-; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
+; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
 ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    retq
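
The avx.ll delta shows the same property from the other direction: %xmm4 is a vbroadcastss result, so a blend that takes its lane 3 and an insert that takes its lane 0 produce identical bytes. With the canonicalization the shuffle could be expressed lane-aligned and matched as vblendps; after the revert it matches vinsertps again. A tiny standalone check of that equivalence (plain C++ with hypothetical values, illustrative only):

#include <array>
#include <cassert>

int main() {
  std::array<float, 4> A = {1.0f, 2.0f, 3.0f, 4.0f};
  std::array<float, 4> Splat = {7.0f, 7.0f, 7.0f, 7.0f}; // vbroadcastss result

  // vinsertps-style: lanes 0..2 from A, lane 0 of the broadcast.
  std::array<float, 4> Insert = {A[0], A[1], A[2], Splat[0]};
  // vblendps-style: lanes 0..2 from A, lane 3 of the broadcast.
  std::array<float, 4> Blend = {A[0], A[1], A[2], Splat[3]};

  // Identical because every lane of a broadcast holds the same value.
  assert(Insert == Blend);
  return 0;
}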

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 2 additions & 2 deletions
@@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,5,2,7]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
@@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,5,2,7]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
 ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}

llvm/test/CodeGen/X86/pr15296.ll

Lines changed: 22 additions & 5 deletions
@@ -26,11 +26,28 @@ allocas:
 define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
 ; CHECK-LABEL: shiftInput___canonical:
 ; CHECK:       # %bb.0: # %allocas
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpsrld %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpsrld %xmm2, %xmm3, %xmm4
+; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm5
+; CHECK-NEXT:    vpsrld %xmm5, %xmm3, %xmm6
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
+; CHECK-NEXT:    vpsrld %xmm6, %xmm3, %xmm7
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; CHECK-NEXT:    vpsrld %xmm1, %xmm3, %xmm3
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
+; CHECK-NEXT:    vpsrld %xmm5, %xmm0, %xmm4
+; CHECK-NEXT:    vpsrld %xmm6, %xmm0, %xmm5
+; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 allocas:
   %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0
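
One visible cost of the revert, reading the diff above: the shift amount is a scalar splatted across the vector, and with the canonicalization the lowering saw through the splat and emitted one uniform vpsrld per 128-bit half; without it, each half is assembled from four scalar-shifted copies merged with vpblendw. The underlying identity is simply that shifting every lane by a splatted amount equals one uniform shift, e.g. (a minimal sketch with hypothetical values):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint32_t, 8> V = {1u << 20, 2u << 20, 3u << 20, 4u << 20,
                               5u << 20, 6u << 20, 7u << 20, 8u << 20};
  std::array<uint32_t, 8> Amt;
  Amt.fill(16); // the splatted scalar shift amount

  // Per-lane variable shift: what the post-revert lowering effectively
  // computes via four scalar-shifted copies plus blends per half.
  std::array<uint32_t, 8> PerLane;
  for (int I = 0; I != 8; ++I)
    PerLane[I] = V[I] >> Amt[I];

  // Uniform shift: the single vpsrld per half the pre-revert code emitted.
  std::array<uint32_t, 8> Uniform;
  for (int I = 0; I != 8; ++I)
    Uniform[I] = V[I] >> 16;

  assert(PerLane == Uniform);
  return 0;
}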
