Skip to content

Commit 992fad0

Browse files
committed
[X86][AVX] Replace extract_subvector(broadcast(), 0) folds with generic SimplifyDemandedVectorEltsForTargetNode handling.
Simplifies a few more cases, notably shuffle demanded elts cases.
1 parent 7722494 commit 992fad0

File tree

7 files changed

+51
-43
lines changed

7 files changed

+51
-43
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38007,7 +38007,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
3800738007
SDValue Src = Op.getOperand(0);
3800838008
MVT SrcVT = Src.getSimpleValueType();
3800938009
if (!SrcVT.isVector())
38010-
return false;
38010+
break;
3801138011
// Don't bother broadcasting if we just need the 0'th element.
3801238012
if (DemandedElts == 1) {
3801338013
if (Src.getValueType() != VT)
@@ -38060,6 +38060,33 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
3806038060
ExtSizeInBits = SizeInBits / 4;
3806138061

3806238062
switch (Opc) {
38063+
// Scalar broadcast.
38064+
case X86ISD::VBROADCAST: {
38065+
SDLoc DL(Op);
38066+
SDValue Src = Op.getOperand(0);
38067+
if (Src.getValueSizeInBits() > ExtSizeInBits)
38068+
Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
38069+
EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38070+
ExtSizeInBits / VT.getScalarSizeInBits());
38071+
SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
38072+
return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38073+
TLO.DAG, DL, ExtSizeInBits));
38074+
}
38075+
case X86ISD::VBROADCAST_LOAD: {
38076+
SDLoc DL(Op);
38077+
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
38078+
EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38079+
ExtSizeInBits / VT.getScalarSizeInBits());
38080+
SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
38081+
SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
38082+
SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
38083+
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
38084+
MemIntr->getMemOperand());
38085+
TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
38086+
Bcst.getValue(1));
38087+
return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38088+
TLO.DAG, DL, ExtSizeInBits));
38089+
}
3806338090
// Subvector broadcast.
3806438091
case X86ISD::SUBV_BROADCAST: {
3806538092
SDLoc DL(Op);
@@ -49365,27 +49392,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
4936549392
InVec.getOperand(1), InVec.getOperand(2));
4936649393
}
4936749394

49368-
// If we're extracting from a broadcast then we're better off just
49369-
// broadcasting to the smaller type directly, assuming this is the only use.
49370-
// As it's a broadcast we don't care about the extraction index.
49371-
if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
49372-
InVec.getOperand(0).getValueSizeInBits() <= SizeInBits)
49373-
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
49374-
49375-
if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
49376-
auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
49377-
if (MemIntr->getMemoryVT().getSizeInBits() <= SizeInBits) {
49378-
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
49379-
SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
49380-
SDValue BcastLd =
49381-
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
49382-
MemIntr->getMemoryVT(),
49383-
MemIntr->getMemOperand());
49384-
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
49385-
return BcastLd;
49386-
}
49387-
}
49388-
4938949395
// If we're extracting an upper subvector from a broadcast we should just
4939049396
// extract the lowest subvector instead which should allow
4939149397
// SimplifyDemandedVectorElts to do more simplifications.

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3399,9 +3399,10 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float
33993399
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
34003400
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
34013401
; CHECK: # %bb.0:
3402-
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3403-
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [60129542148,60129542148,60129542148,60129542148]
3404-
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3402+
; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
3403+
; CHECK-NEXT: # xmm2 = mem[0,0]
3404+
; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
3405+
; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
34053406
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
34063407
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
34073408
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
@@ -3417,11 +3418,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>*
34173418
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
34183419
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
34193420
; CHECK: # %bb.0:
3420-
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3421-
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [60129542148,60129542148,60129542148,60129542148]
3421+
; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
3422+
; CHECK-NEXT: # xmm2 = mem[0,0]
3423+
; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
34223424
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
34233425
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3424-
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3426+
; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
34253427
; CHECK-NEXT: vmovaps %xmm1, %xmm0
34263428
; CHECK-NEXT: vzeroupper
34273429
; CHECK-NEXT: retq

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2027,7 +2027,7 @@ define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) {
20272027
; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
20282028
; AVX2-SLOW-NEXT: vxorps %xmm2, %xmm2, %xmm2
20292029
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
2030-
; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %ymm1
2030+
; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm1
20312031
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
20322032
; AVX2-SLOW-NEXT: retq
20332033
;

llvm/test/CodeGen/X86/vec_int_to_fp.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -734,8 +734,8 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
734734
;
735735
; AVX2-LABEL: uitofp_4i32_to_2f64:
736736
; AVX2: # %bb.0:
737-
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
738737
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
738+
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
739739
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
740740
; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0
741741
; AVX2-NEXT: retq

llvm/test/CodeGen/X86/vector-fshl-rot-256.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -672,8 +672,8 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
672672
;
673673
; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
674674
; XOPAVX2: # %bb.0:
675-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
676675
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
676+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
677677
; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
678678
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
679679
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -769,8 +769,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
769769
;
770770
; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
771771
; XOPAVX2: # %bb.0:
772-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
773772
; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
773+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
774774
; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
775775
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
776776
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -891,8 +891,8 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
891891
;
892892
; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
893893
; XOPAVX2: # %bb.0:
894-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
895894
; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
895+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
896896
; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
897897
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
898898
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -1073,8 +1073,8 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
10731073
;
10741074
; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
10751075
; XOPAVX2: # %bb.0:
1076-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
10771076
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1077+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
10781078
; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
10791079
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
10801080
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0

llvm/test/CodeGen/X86/vector-rotate-256.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -647,8 +647,8 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
647647
;
648648
; XOPAVX2-LABEL: splatvar_rotate_v4i64:
649649
; XOPAVX2: # %bb.0:
650-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
651650
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
651+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
652652
; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
653653
; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
654654
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -747,8 +747,8 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
747747
;
748748
; XOPAVX2-LABEL: splatvar_rotate_v8i32:
749749
; XOPAVX2: # %bb.0:
750-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
751750
; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
751+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
752752
; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
753753
; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
754754
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -872,8 +872,8 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
872872
;
873873
; XOPAVX2-LABEL: splatvar_rotate_v16i16:
874874
; XOPAVX2: # %bb.0:
875-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
876875
; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
876+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
877877
; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
878878
; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
879879
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -979,8 +979,8 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
979979
;
980980
; AVX512BW-LABEL: splatvar_rotate_v32i8:
981981
; AVX512BW: # %bb.0:
982-
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
983982
; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
983+
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
984984
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
985985
; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
986986
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -993,8 +993,8 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
993993
;
994994
; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
995995
; AVX512VLBW: # %bb.0:
996-
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
997996
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
997+
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
998998
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
999999
; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm2
10001000
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1007,8 +1007,8 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
10071007
;
10081008
; AVX512VBMI2-LABEL: splatvar_rotate_v32i8:
10091009
; AVX512VBMI2: # %bb.0:
1010-
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
10111010
; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
1011+
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
10121012
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
10131013
; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
10141014
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1021,8 +1021,8 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
10211021
;
10221022
; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i8:
10231023
; AVX512VLVBMI2: # %bb.0:
1024-
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
10251024
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
1025+
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
10261026
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
10271027
; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
10281028
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1045,8 +1045,8 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
10451045
;
10461046
; XOPAVX2-LABEL: splatvar_rotate_v32i8:
10471047
; XOPAVX2: # %bb.0:
1048-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
10491048
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1049+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
10501050
; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
10511051
; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
10521052
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0

llvm/test/CodeGen/X86/vector-shift-shl-256.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -669,8 +669,8 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
669669
;
670670
; XOPAVX2-LABEL: splatvar_shift_v32i8:
671671
; XOPAVX2: # %bb.0:
672-
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
673672
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
673+
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
674674
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
675675
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
676676
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0

0 commit comments

Comments
 (0)