Skip to content

Commit 2cbf9fd

Browse files
committed
[DAG] DAGCombiner::visitVECTOR_SHUFFLE - recognise INSERT_SUBVECTOR patterns
IR typically creates INSERT_SUBVECTOR patterns as a widening of the subvector with undefs to pad to the destination size, followed by a shuffle for the actual insertion - SelectionDAGBuilder has to do something similar for shuffles when source/destination vectors are different sizes. This combine attempts to recognize these patterns by looking for a shuffle of a subvector (from a CONCAT_VECTORS) that starts at a modulo of its size into an otherwise identity shuffle of the base vector. This uncovered a couple of target-specific issues as we haven't often created INSERT_SUBVECTOR nodes in generic code - aarch64 could only handle insertions into the bottom of undefs (i.e. a vector widening), and x86-avx512 vXi1 insertion wasn't keeping track of undef elements in the base vector. Fixes PR50053 Differential Revision: https://reviews.llvm.org/D107068
1 parent 38b098b commit 2cbf9fd

File tree

9 files changed

+147
-36
lines changed

9 files changed

+147
-36
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21299,6 +21299,70 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
2129921299
}
2130021300
}
2130121301

21302+
// See if we can replace a shuffle with an insert_subvector.
21303+
// e.g. v2i32 into v8i32:
21304+
// shuffle(lhs,concat(rhs0,rhs1,rhs2,rhs3),0,1,2,3,10,11,6,7).
21305+
// --> insert_subvector(lhs,rhs1,4).
21306+
if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT) &&
21307+
TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT)) {
21308+
auto ShuffleToInsert = [&](SDValue LHS, SDValue RHS, ArrayRef<int> Mask) {
21309+
// Ensure RHS subvectors are legal.
21310+
assert(RHS.getOpcode() == ISD::CONCAT_VECTORS && "Can't find subvectors");
21311+
EVT SubVT = RHS.getOperand(0).getValueType();
21312+
int NumSubVecs = RHS.getNumOperands();
21313+
int NumSubElts = SubVT.getVectorNumElements();
21314+
assert((NumElts % NumSubElts) == 0 && "Subvector mismatch");
21315+
if (!TLI.isTypeLegal(SubVT))
21316+
return SDValue();
21317+
21318+
// Don't bother if we have an unary shuffle (matches undef + LHS elts).
21319+
if (all_of(Mask, [NumElts](int M) { return M < (int)NumElts; }))
21320+
return SDValue();
21321+
21322+
// Search [NumSubElts] spans for RHS sequence.
21323+
// TODO: Can we avoid nested loops to increase performance?
21324+
SmallVector<int> InsertionMask(NumElts);
21325+
for (int SubVec = 0; SubVec != NumSubVecs; ++SubVec) {
21326+
for (int SubIdx = 0; SubIdx != (int)NumElts; SubIdx += NumSubElts) {
21327+
// Reset mask to identity.
21328+
std::iota(InsertionMask.begin(), InsertionMask.end(), 0);
21329+
21330+
// Add subvector insertion.
21331+
std::iota(InsertionMask.begin() + SubIdx,
21332+
InsertionMask.begin() + SubIdx + NumSubElts,
21333+
NumElts + (SubVec * NumSubElts));
21334+
21335+
// See if the shuffle mask matches the reference insertion mask.
21336+
bool MatchingShuffle = true;
21337+
for (int i = 0; i != (int)NumElts; ++i) {
21338+
int ExpectIdx = InsertionMask[i];
21339+
int ActualIdx = Mask[i];
21340+
if (0 <= ActualIdx && ExpectIdx != ActualIdx) {
21341+
MatchingShuffle = false;
21342+
break;
21343+
}
21344+
}
21345+
21346+
if (MatchingShuffle)
21347+
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, LHS,
21348+
RHS.getOperand(SubVec),
21349+
DAG.getVectorIdxConstant(SubIdx, SDLoc(N)));
21350+
}
21351+
}
21352+
return SDValue();
21353+
};
21354+
ArrayRef<int> Mask = SVN->getMask();
21355+
if (N1.getOpcode() == ISD::CONCAT_VECTORS)
21356+
if (SDValue InsertN1 = ShuffleToInsert(N0, N1, Mask))
21357+
return InsertN1;
21358+
if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
21359+
SmallVector<int> CommuteMask(Mask.begin(), Mask.end());
21360+
ShuffleVectorSDNode::commuteMask(CommuteMask);
21361+
if (SDValue InsertN0 = ShuffleToInsert(N1, N0, CommuteMask))
21362+
return InsertN0;
21363+
}
21364+
}
21365+
2130221366
// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
2130321367
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
2130421368
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -905,6 +905,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
905905
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
906906
setTargetDAGCombine(ISD::TRUNCATE);
907907
setTargetDAGCombine(ISD::CONCAT_VECTORS);
908+
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
908909
setTargetDAGCombine(ISD::STORE);
909910
if (Subtarget->supportsAddressTopByteIgnored())
910911
setTargetDAGCombine(ISD::LOAD);
@@ -13617,6 +13618,48 @@ static SDValue performConcatVectorsCombine(SDNode *N,
1361713618
RHS));
1361813619
}
1361913620

13621+
static SDValue
13622+
performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
13623+
SelectionDAG &DAG) {
13624+
SDValue Vec = N->getOperand(0);
13625+
SDValue SubVec = N->getOperand(1);
13626+
uint64_t IdxVal = N->getConstantOperandVal(2);
13627+
EVT VecVT = Vec.getValueType();
13628+
EVT SubVT = SubVec.getValueType();
13629+
13630+
// Only do this for legal fixed vector types.
13631+
if (!VecVT.isFixedLengthVector() ||
13632+
!DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
13633+
!DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
13634+
return SDValue();
13635+
13636+
// Ignore widening patterns.
13637+
if (IdxVal == 0 && Vec.isUndef())
13638+
return SDValue();
13639+
13640+
// Subvector must be half the width and an "aligned" insertion.
13641+
unsigned NumSubElts = SubVT.getVectorNumElements();
13642+
if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
13643+
(IdxVal != 0 && IdxVal != NumSubElts))
13644+
return SDValue();
13645+
13646+
// Fold insert_subvector -> concat_vectors
13647+
// insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
13648+
// insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
13649+
SDLoc DL(N);
13650+
SDValue Lo, Hi;
13651+
if (IdxVal == 0) {
13652+
Lo = SubVec;
13653+
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
13654+
DAG.getVectorIdxConstant(NumSubElts, DL));
13655+
} else {
13656+
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
13657+
DAG.getVectorIdxConstant(0, DL));
13658+
Hi = SubVec;
13659+
}
13660+
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
13661+
}
13662+
1362013663
static SDValue tryCombineFixedPointConvert(SDNode *N,
1362113664
TargetLowering::DAGCombinerInfo &DCI,
1362213665
SelectionDAG &DAG) {
@@ -16673,6 +16716,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
1667316716
return performVectorTruncateCombine(N, DCI, DAG);
1667416717
case ISD::CONCAT_VECTORS:
1667516718
return performConcatVectorsCombine(N, DCI, DAG);
16719+
case ISD::INSERT_SUBVECTOR:
16720+
return performInsertSubvectorCombine(N, DCI, DAG);
1667616721
case ISD::SELECT:
1667716722
return performSelectCombine(N, DCI);
1667816723
case ISD::VSELECT:

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6206,14 +6206,21 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
62066206

62076207
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
62086208
assert(IdxVal != 0 && "Unexpected index");
6209-
NumElems = WideOpVT.getVectorNumElements();
6210-
unsigned ShiftLeft = NumElems - SubVecNumElems;
6211-
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6212-
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6213-
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6214-
if (ShiftRight != 0)
6215-
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6216-
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6209+
// If upper elements of Vec are known undef, then just shift into place.
6210+
if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6211+
[](SDValue V) { return V.isUndef(); })) {
6212+
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6213+
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6214+
} else {
6215+
NumElems = WideOpVT.getVectorNumElements();
6216+
unsigned ShiftLeft = NumElems - SubVecNumElems;
6217+
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6218+
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6219+
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6220+
if (ShiftRight != 0)
6221+
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6222+
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6223+
}
62176224
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
62186225
}
62196226

llvm/test/CodeGen/AArch64/arm64-neon-copy.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1794,7 +1794,7 @@ define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 {
17941794
; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
17951795
; CHECK: // %bb.0: // %entry
17961796
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
1797-
; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
1797+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
17981798
; CHECK-NEXT: ret
17991799
entry:
18001800
%vecext = extractelement <2 x i64> %x, i32 0

llvm/test/CodeGen/X86/2012-04-26-sdglue.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ define void @func(<4 x float> %a, <16 x i8> %b, <16 x i8> %c, <8 x float> %d, <8
1414
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
1515
; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm0
1616
; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
17-
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
1817
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
18+
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1919
; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
2020
; CHECK-NEXT: vhaddps %ymm4, %ymm0, %ymm0
2121
; CHECK-NEXT: vsubps %ymm0, %ymm0, %ymm0

llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1
8787
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
8888
; CHECK: # %bb.0:
8989
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
90-
; CHECK-NEXT: vblendps $240, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x0c,0xc0,0xf0]
90+
; CHECK-NEXT: vblendps $15, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x0f]
9191
; CHECK-NEXT: # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9292
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
9393
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)

llvm/test/CodeGen/X86/avx-vperm2x128.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -695,11 +695,9 @@ define void @PR50053(<4 x i64>* nocapture %0, <4 x i64>* nocapture readonly %1)
695695
; ALL-LABEL: PR50053:
696696
; ALL: # %bb.0:
697697
; ALL-NEXT: vmovaps (%rsi), %ymm0
698-
; ALL-NEXT: vmovaps 32(%rsi), %xmm1
699-
; ALL-NEXT: vmovaps 48(%rsi), %xmm2
700-
; ALL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1]
698+
; ALL-NEXT: vinsertf128 $1, 32(%rsi), %ymm0, %ymm1
699+
; ALL-NEXT: vinsertf128 $0, 48(%rsi), %ymm0, %ymm0
701700
; ALL-NEXT: vmovaps %ymm1, (%rdi)
702-
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
703701
; ALL-NEXT: vmovaps %ymm0, 32(%rdi)
704702
; ALL-NEXT: vzeroupper
705703
; ALL-NEXT: retq

llvm/test/CodeGen/X86/pr34592.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,35 +14,35 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
1414
; CHECK-NEXT: vmovaps %ymm4, %ymm10
1515
; CHECK-NEXT: vmovaps %ymm3, %ymm9
1616
; CHECK-NEXT: vmovaps %ymm1, %ymm8
17-
; CHECK-NEXT: vmovaps %ymm0, %ymm3
17+
; CHECK-NEXT: vmovaps %ymm0, %ymm4
1818
; CHECK-NEXT: vmovaps 240(%rbp), %ymm1
19-
; CHECK-NEXT: vmovaps 208(%rbp), %ymm4
19+
; CHECK-NEXT: vmovaps 208(%rbp), %ymm3
2020
; CHECK-NEXT: vmovaps 176(%rbp), %ymm0
2121
; CHECK-NEXT: vmovaps 144(%rbp), %ymm0
2222
; CHECK-NEXT: vmovaps 112(%rbp), %ymm11
2323
; CHECK-NEXT: vmovaps 80(%rbp), %ymm11
2424
; CHECK-NEXT: vmovaps 48(%rbp), %ymm11
2525
; CHECK-NEXT: vmovaps 16(%rbp), %ymm11
26-
; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7]
27-
; CHECK-NEXT: vmovaps %xmm4, %xmm6
26+
; CHECK-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7]
27+
; CHECK-NEXT: vmovaps %xmm3, %xmm6
2828
; CHECK-NEXT: # implicit-def: $ymm2
2929
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
30-
; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
30+
; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3131
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
3232
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
3333
; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
3434
; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm2[0],zero
3535
; CHECK-NEXT: # implicit-def: $ymm2
3636
; CHECK-NEXT: vmovaps %xmm6, %xmm2
37-
; CHECK-NEXT: # kill: def $xmm3 killed $xmm3 killed $ymm3
38-
; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
39-
; CHECK-NEXT: vmovaps %xmm7, %xmm3
40-
; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
41-
; CHECK-NEXT: # implicit-def: $ymm3
42-
; CHECK-NEXT: vmovaps %xmm6, %xmm3
43-
; CHECK-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
44-
; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
45-
; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
37+
; CHECK-NEXT: # kill: def $xmm4 killed $xmm4 killed $ymm4
38+
; CHECK-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
39+
; CHECK-NEXT: vmovaps %xmm7, %xmm4
40+
; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
41+
; CHECK-NEXT: # implicit-def: $ymm4
42+
; CHECK-NEXT: vmovaps %xmm6, %xmm4
43+
; CHECK-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
44+
; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
45+
; CHECK-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
4646
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
4747
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
4848
; CHECK-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]

llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -563,9 +563,7 @@ define <16 x float> @insert_sub0_0(<16 x float> %base, <4 x float> %sub1, <4 x f
563563
define <16 x float> @insert_sub1_12(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) {
564564
; ALL-LABEL: insert_sub1_12:
565565
; ALL: # %bb.0:
566-
; ALL-NEXT: vinsertf32x4 $1, %xmm2, %zmm0, %zmm1
567-
; ALL-NEXT: vmovapd {{.*#+}} zmm2 = [0,1,2,3,4,5,10,11]
568-
; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
566+
; ALL-NEXT: vinsertf32x4 $3, %xmm2, %zmm0, %zmm0
569567
; ALL-NEXT: retq
570568
%sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
571569
%sub34 = shufflevector <4 x float> %sub3, <4 x float> %sub4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -591,8 +589,8 @@ define <16 x float> @insert_sub2_4(<16 x float> %base, <4 x float> %sub1, <4 x f
591589
define <16 x float> @insert_sub01_8(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) {
592590
; ALL-LABEL: insert_sub01_8:
593591
; ALL: # %bb.0:
594-
; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
595-
; ALL-NEXT: vinsertf32x4 $1, %xmm2, %zmm1, %zmm1
592+
; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
593+
; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
596594
; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
597595
; ALL-NEXT: retq
598596
%sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -607,8 +605,7 @@ define <16 x float> @insert_sub23_0(<16 x float> %base, <4 x float> %sub1, <4 x
607605
; ALL: # %bb.0:
608606
; ALL-NEXT: # kill: def $xmm3 killed $xmm3 def $ymm3
609607
; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm1
610-
; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm1
611-
; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7]
608+
; ALL-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0
612609
; ALL-NEXT: retq
613610
%sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
614611
%sub34 = shufflevector <4 x float> %sub3, <4 x float> %sub4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

0 commit comments

Comments (0)