Skip to content

Commit 13f2aee

Browse files
committed
[X86][AVX] Generalize vperm2f128/vperm2i128 patterns to support all legal 256-bit vector types
Remove the bitcasts to/from the v4f64/v4i64 types around vperm2f128/vperm2i128 ops, to help improve shuffle combining and demanded vector elements folding.
1 parent 6e530a3 commit 13f2aee

10 files changed

+81
-86
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35436,7 +35436,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3543635436
DL, 256);
3543735437
}
3543835438

35439-
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
3544035439
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
3544135440
return SDValue(); // Nothing to do!
3544235441

@@ -35449,12 +35448,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3544935448
unsigned PermMask = 0;
3545035449
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
3545135450
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
35452-
35453-
Res = CanonicalizeShuffleInput(ShuffleVT, V1);
35454-
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
35455-
DAG.getUNDEF(ShuffleVT),
35456-
DAG.getTargetConstant(PermMask, DL, MVT::i8));
35457-
return DAG.getBitcast(RootVT, Res);
35451+
return DAG.getNode(
35452+
X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
35453+
DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
3545835454
}
3545935455

3546035456
if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
@@ -35470,14 +35466,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3547035466
unsigned PermMask = 0;
3547135467
PermMask |= ((BaseMask[0] & 3) << 0);
3547235468
PermMask |= ((BaseMask[1] & 3) << 4);
35473-
3547435469
SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
3547535470
SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
35476-
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT,
35477-
CanonicalizeShuffleInput(ShuffleVT, LHS),
35478-
CanonicalizeShuffleInput(ShuffleVT, RHS),
35471+
return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
35472+
CanonicalizeShuffleInput(RootVT, LHS),
35473+
CanonicalizeShuffleInput(RootVT, RHS),
3547935474
DAG.getTargetConstant(PermMask, DL, MVT::i8));
35480-
return DAG.getBitcast(RootVT, Res);
3548135475
}
3548235476
}
3548335477
}
@@ -37323,11 +37317,26 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
3732337317
return SDValue();
3732437318
}
3732537319
case X86ISD::VPERM2X128: {
37320+
// Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
37321+
SDValue LHS = N->getOperand(0);
37322+
SDValue RHS = N->getOperand(1);
37323+
if (LHS.getOpcode() == ISD::BITCAST &&
37324+
(RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
37325+
EVT SrcVT = LHS.getOperand(0).getValueType();
37326+
if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
37327+
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
37328+
DAG.getBitcast(SrcVT, LHS),
37329+
DAG.getBitcast(SrcVT, RHS),
37330+
N->getOperand(2)));
37331+
}
37332+
}
37333+
37334+
// Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
3732637335
if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
37327-
return Res;
37336+
return Res;
3732837337

37329-
// Combine vperm2x128 subvector shuffle with an inner concat pattern.
37330-
// vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
37338+
// Fold vperm2x128 subvector shuffle with an inner concat pattern.
37339+
// vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
3733137340
auto FindSubVector128 = [&](unsigned Idx) {
3733237341
if (Idx > 3)
3733337342
return SDValue();

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 31 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7287,40 +7287,40 @@ let ExeDomain = SSEPackedSingle in {
72877287
let isCommutable = 1 in
72887288
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
72897289
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
7290-
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7291-
[(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7292-
(i8 timm:$src3))))]>, VEX_4V, VEX_L,
7293-
Sched<[WriteFShuffle256]>;
7290+
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7291+
VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
72947292
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
72957293
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7296-
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7297-
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7298-
(i8 timm:$src3)))]>, VEX_4V, VEX_L,
7299-
Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7294+
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7295+
VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
73007296
}
73017297

73027298
// Immediate transform to help with commuting.
73037299
def Perm2XCommuteImm : SDNodeXForm<timm, [{
73047300
return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
73057301
}]>;
73067302

7303+
multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
7304+
def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7305+
(!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
7306+
def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
7307+
(!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
7308+
// Pattern with load in other operand.
7309+
def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
7310+
(!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7311+
(Perm2XCommuteImm timm:$imm))>;
7312+
}
7313+
73077314
let Predicates = [HasAVX] in {
7308-
// Pattern with load in other operand.
7309-
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7310-
VR256:$src1, (i8 timm:$imm))),
7311-
(VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7315+
defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
7316+
defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
73127317
}
73137318

73147319
let Predicates = [HasAVX1Only] in {
7315-
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7316-
(VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
7317-
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7318-
(loadv4i64 addr:$src2), (i8 timm:$imm))),
7319-
(VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
7320-
// Pattern with load in other operand.
7321-
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7322-
VR256:$src1, (i8 timm:$imm))),
7323-
(VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7320+
defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
7321+
defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
7322+
defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
7323+
defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
73247324
}
73257325

73267326
//===----------------------------------------------------------------------===//
@@ -7689,27 +7689,24 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
76897689
WriteFShuffle256, f256mem>, VEX_W;
76907690

76917691
//===----------------------------------------------------------------------===//
7692-
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
7692+
// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
76937693
//
76947694
let isCommutable = 1 in
76957695
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
76967696
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
7697-
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7698-
[(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7699-
(i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
7700-
VEX_4V, VEX_L;
7697+
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7698+
Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
77017699
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
77027700
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7703-
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7704-
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
7705-
(i8 timm:$src3)))]>,
7701+
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
77067702
Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
77077703

7708-
let Predicates = [HasAVX2] in
7709-
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7710-
VR256:$src1, (i8 timm:$imm))),
7711-
(VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7712-
7704+
let Predicates = [HasAVX2] in {
7705+
defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
7706+
defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
7707+
defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
7708+
defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
7709+
}
77137710

77147711
//===----------------------------------------------------------------------===//
77157712
// VINSERTI128 - Insert packed integer values

llvm/test/CodeGen/X86/haddsub-2.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -587,9 +587,8 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
587587
;
588588
; AVX2-LABEL: avx2_vphadd_d_test:
589589
; AVX2: # %bb.0:
590-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
591-
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
592-
; AVX2-NEXT: vphaddd %ymm2, %ymm0, %ymm0
590+
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
591+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
593592
; AVX2-NEXT: retq
594593
%vecext = extractelement <8 x i32> %A, i32 0
595594
%vecext1 = extractelement <8 x i32> %A, i32 1
@@ -743,9 +742,8 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
743742
;
744743
; AVX2-LABEL: avx2_vphadd_w_test:
745744
; AVX2: # %bb.0:
746-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
747-
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
748-
; AVX2-NEXT: vphaddw %ymm2, %ymm0, %ymm0
745+
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
746+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
749747
; AVX2-NEXT: retq
750748
%vecext = extractelement <16 x i16> %a, i32 0
751749
%vecext1 = extractelement <16 x i16> %a, i32 1

llvm/test/CodeGen/X86/masked_store_trunc.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5138,9 +5138,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
51385138
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
51395139
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
51405140
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
5141-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
5142-
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
5143-
; AVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
5141+
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
5142+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
51445143
; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm1
51455144
; AVX2-NEXT: vpmovmskb %ymm1, %eax
51465145
; AVX2-NEXT: notl %eax

llvm/test/CodeGen/X86/var-permute-256.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -626,16 +626,14 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices)
626626
; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
627627
; XOP: # %bb.0: # %entry
628628
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
629-
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
630629
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
631-
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
630+
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
632631
; XOP-NEXT: retq
633632
;
634633
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
635634
; AVX1: # %bb.0: # %entry
636635
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
637-
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
638-
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
636+
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2
639637
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
640638
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
641639
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
@@ -1049,16 +1047,14 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indi
10491047
; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
10501048
; XOP: # %bb.0: # %entry
10511049
; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1052-
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
10531050
; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1054-
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
1051+
; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
10551052
; XOP-NEXT: retq
10561053
;
10571054
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
10581055
; AVX1: # %bb.0: # %entry
10591056
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1060-
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
1061-
; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
1057+
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2
10621058
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10631059
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
10641060
; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3

llvm/test/CodeGen/X86/vector-reduce-and-bool.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -717,9 +717,8 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
717717
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
718718
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
719719
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
720-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
721-
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
722-
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
720+
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
721+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
723722
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
724723
; AVX2-NEXT: vpmovmskb %ymm0, %eax
725724
; AVX2-NEXT: cmpl $-1, %eax

llvm/test/CodeGen/X86/vector-reduce-or-bool.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -700,9 +700,8 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
700700
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
701701
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
702702
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
703-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
704-
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
705-
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
703+
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
704+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
706705
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
707706
; AVX2-NEXT: vpmovmskb %ymm0, %eax
708707
; AVX2-NEXT: testl %eax, %eax

llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -756,9 +756,8 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
756756
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
757757
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
758758
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
759-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
760-
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
761-
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
759+
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
760+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
762761
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
763762
; AVX2-NEXT: vpmovmskb %ymm0, %eax
764763
; AVX2-NEXT: movl %eax, %ecx

llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5964,7 +5964,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
59645964
; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
59655965
; AVX2: # %bb.0:
59665966
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
5967-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
5967+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
59685968
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
59695969
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
59705970
; AVX2-NEXT: retq
@@ -5988,7 +5988,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
59885988
; XOPAVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
59895989
; XOPAVX2: # %bb.0:
59905990
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
5991-
; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
5991+
; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
59925992
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
59935993
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
59945994
; XOPAVX2-NEXT: retq
@@ -6146,7 +6146,7 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
61466146
; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
61476147
; AVX2: # %bb.0:
61486148
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
6149-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
6149+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
61506150
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
61516151
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
61526152
; AVX2-NEXT: retq
@@ -6171,7 +6171,7 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
61716171
; XOPAVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
61726172
; XOPAVX2: # %bb.0:
61736173
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
6174-
; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
6174+
; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
61756175
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
61766176
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
61776177
; XOPAVX2-NEXT: retq
@@ -6329,7 +6329,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
63296329
; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
63306330
; AVX2: # %bb.0:
63316331
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
6332-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
6332+
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
63336333
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
63346334
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
63356335
; AVX2-NEXT: retq
@@ -6353,7 +6353,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
63536353
; XOPAVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
63546354
; XOPAVX2: # %bb.0:
63556355
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
6356-
; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
6356+
; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
63576357
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
63586358
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
63596359
; XOPAVX2-NEXT: retq
@@ -6407,7 +6407,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
64076407
; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
64086408
; AVX2: # %bb.0:
64096409
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
6410-
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
6410+
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
64116411
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
64126412
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
64136413
; AVX2-NEXT: retq
@@ -6432,7 +6432,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
64326432
; XOPAVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
64336433
; XOPAVX2: # %bb.0:
64346434
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
6435-
; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
6435+
; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
64366436
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
64376437
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
64386438
; XOPAVX2-NEXT: retq

0 commit comments

Comments
 (0)