Skip to content

Commit e82e17d

Browse files
committed
[X86] Add lowerShuffleAsBitRotate (PR44379)
As noted on PR44379, we didn't attempt to lower vector shuffles using bit rotations on XOP/AVX512F targets. This patch lowers to uniform ISD::ROTL nodes - ROTR isn't supported by XOP and they are interchangeable for constant values anyway. There might be cases where targets without ISD::ROTL support would benefit from this (expanding to SRL+SHL+OR), which I'll investigate in a future patch. Also, non-AVX512BW targets fail to concatenate 256-bit rotations back to 512-bits (split during shuffle lowering as they don't have v32i16/v64i8 types).
1 parent dd26222 commit e82e17d

File tree

7 files changed

+182
-121
lines changed

7 files changed

+182
-121
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11665,6 +11665,66 @@ static SDValue lowerShuffleAsDecomposedShuffleBlend(
1166511665
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
1166611666
}
1166711667

11668+
/// Try to lower a vector shuffle as a bit rotation.
11669+
///
11670+
/// Look for a repeated rotation pattern in each sub group.
11671+
/// Returns a ISD::ROTL element rotation amount or -1 if failed.
11672+
static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
11673+
int NumElts = Mask.size();
11674+
assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
11675+
11676+
int RotateAmt = -1;
11677+
for (int i = 0; i != NumElts; i += NumSubElts) {
11678+
for (int j = 0; j != NumSubElts; ++j) {
11679+
int M = Mask[i + j];
11680+
if (M < 0)
11681+
continue;
11682+
if (!isInRange(M, i, i + NumSubElts))
11683+
return -1;
11684+
int Offset = ((M - i) + (NumSubElts - j)) % NumSubElts;
11685+
if (0 <= RotateAmt && Offset != RotateAmt)
11686+
return -1;
11687+
RotateAmt = Offset;
11688+
}
11689+
}
11690+
return RotateAmt;
11691+
}
11692+
11693+
/// Lower shuffle using ISD::ROTL rotations.
11694+
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11695+
ArrayRef<int> Mask,
11696+
const X86Subtarget &Subtarget,
11697+
SelectionDAG &DAG) {
11698+
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11699+
11700+
MVT SVT = VT.getScalarType();
11701+
int EltSizeInBits = SVT.getScalarSizeInBits();
11702+
assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11703+
11704+
// Only XOP + AVX512 targets have bit rotation instructions.
11705+
if (!((VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512()))
11706+
return SDValue();
11707+
11708+
// AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11709+
int MinSubElts = Subtarget.hasXOP() ? 2 : std::max(32 / EltSizeInBits, 2);
11710+
int MaxSubElts = 64 / EltSizeInBits;
11711+
for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
11712+
int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
11713+
if (RotateAmt < 0)
11714+
continue;
11715+
int NumElts = VT.getVectorNumElements();
11716+
MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11717+
MVT RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11718+
int RotateAmtInBits = RotateAmt * EltSizeInBits;
11719+
SDValue Rot =
11720+
DAG.getNode(ISD::ROTL, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11721+
DAG.getConstant(RotateAmtInBits, DL, RotateVT));
11722+
return DAG.getBitcast(VT, Rot);
11723+
}
11724+
11725+
return SDValue();
11726+
}
11727+
1166811728
/// Try to lower a vector shuffle as a byte rotation.
1166911729
///
1167011730
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
@@ -14220,6 +14280,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1422014280
Mask, Subtarget, DAG))
1422114281
return Broadcast;
1422214282

14283+
// Try to use bit rotation instructions.
14284+
if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14285+
Subtarget, DAG))
14286+
return Rotate;
14287+
1422314288
// Use dedicated unpack instructions for masks that match their pattern.
1422414289
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
1422514290
return V;
@@ -14444,6 +14509,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1444414509
Mask, Subtarget, DAG))
1444514510
return Broadcast;
1444614511

14512+
// Try to use bit rotation instructions.
14513+
if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14514+
Subtarget, DAG))
14515+
return Rotate;
14516+
1444714517
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
1444814518
return V;
1444914519

@@ -16334,6 +16404,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1633416404
return V;
1633516405

1633616406
if (V2.isUndef()) {
16407+
// Try to use bit rotation instructions.
16408+
if (SDValue Rotate =
16409+
lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16410+
return Rotate;
16411+
1633716412
// Try to produce a fixed cross-128-bit lane permute followed by unpack
1633816413
// because that should be faster than the variable permute alternatives.
1633916414
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
@@ -16424,14 +16499,20 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1642416499

1642516500
// Try to use shift instructions.
1642616501
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
16427-
Zeroable, Subtarget, DAG))
16502+
Zeroable, Subtarget, DAG))
1642816503
return Shift;
1642916504

1643016505
// Try to use byte rotation instructions.
1643116506
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
1643216507
Subtarget, DAG))
1643316508
return Rotate;
1643416509

16510+
// Try to use bit rotation instructions.
16511+
if (V2.isUndef())
16512+
if (SDValue Rotate =
16513+
lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16514+
return Rotate;
16515+
1643516516
// Try to create an in-lane repeating shuffle mask and then shuffle the
1643616517
// results into the target lanes.
1643716518
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16926,6 +17007,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1692617007
return Rotate;
1692717008

1692817009
if (V2.isUndef()) {
17010+
// Try to use bit rotation instructions.
17011+
if (SDValue Rotate =
17012+
lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17013+
return Rotate;
17014+
1692917015
SmallVector<int, 8> RepeatedMask;
1693017016
if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
1693117017
// As this is a single-input shuffle, the repeated mask should be
@@ -16983,6 +17069,12 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1698317069
Subtarget, DAG))
1698417070
return Rotate;
1698517071

17072+
// Try to use bit rotation instructions.
17073+
if (V2.isUndef())
17074+
if (SDValue Rotate =
17075+
lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17076+
return Rotate;
17077+
1698617078
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
1698717079
Zeroable, Subtarget, DAG))
1698817080
return PSHUFB;

llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -479,10 +479,20 @@ define <16 x i8> @shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14(
479479
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
480480
; SSE41-NEXT: retq
481481
;
482-
; AVX-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
483-
; AVX: # %bb.0:
484-
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
485-
; AVX-NEXT: retq
482+
; AVX1-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
483+
; AVX1: # %bb.0:
484+
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
485+
; AVX1-NEXT: retq
486+
;
487+
; AVX2OR512VL-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
488+
; AVX2OR512VL: # %bb.0:
489+
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
490+
; AVX2OR512VL-NEXT: retq
491+
;
492+
; XOP-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
493+
; XOP: # %bb.0:
494+
; XOP-NEXT: vprotw $8, %xmm0, %xmm0
495+
; XOP-NEXT: retq
486496
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
487497
ret <16 x i8> %shuffle
488498
}
@@ -1902,10 +1912,25 @@ define <16 x i8> @shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14(
19021912
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
19031913
; SSE41-NEXT: retq
19041914
;
1905-
; AVX-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1906-
; AVX: # %bb.0:
1907-
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
1908-
; AVX-NEXT: retq
1915+
; AVX1-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1916+
; AVX1: # %bb.0:
1917+
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
1918+
; AVX1-NEXT: retq
1919+
;
1920+
; AVX2-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1921+
; AVX2: # %bb.0:
1922+
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
1923+
; AVX2-NEXT: retq
1924+
;
1925+
; AVX512VL-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1926+
; AVX512VL: # %bb.0:
1927+
; AVX512VL-NEXT: vprold $24, %xmm0, %xmm0
1928+
; AVX512VL-NEXT: retq
1929+
;
1930+
; XOP-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1931+
; XOP: # %bb.0:
1932+
; XOP-NEXT: vprotd $24, %xmm0, %xmm0
1933+
; XOP-NEXT: retq
19091934
%shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14>
19101935
ret <16 x i8> %shuffle
19111936
}
@@ -1937,13 +1962,12 @@ define <16 x i8> @shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09(
19371962
;
19381963
; AVX512VL-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
19391964
; AVX512VL: # %bb.0:
1940-
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9]
1965+
; AVX512VL-NEXT: vprolq $16, %xmm0, %xmm0
19411966
; AVX512VL-NEXT: retq
19421967
;
19431968
; XOP-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
19441969
; XOP: # %bb.0:
1945-
; XOP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1946-
; XOP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1970+
; XOP-NEXT: vprotq $16, %xmm0, %xmm0
19471971
; XOP-NEXT: retq
19481972
%shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9>
19491973
ret <16 x i8> %shuffle

llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2462,21 +2462,14 @@ define <8 x i16> @shuffle_v8i16_10325476(<8 x i16> %a) {
24622462
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
24632463
; AVX2-FAST-NEXT: retq
24642464
;
2465-
; AVX512VL-SLOW-LABEL: shuffle_v8i16_10325476:
2466-
; AVX512VL-SLOW: # %bb.0:
2467-
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
2468-
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
2469-
; AVX512VL-SLOW-NEXT: retq
2470-
;
2471-
; AVX512VL-FAST-LABEL: shuffle_v8i16_10325476:
2472-
; AVX512VL-FAST: # %bb.0:
2473-
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
2474-
; AVX512VL-FAST-NEXT: retq
2465+
; AVX512VL-LABEL: shuffle_v8i16_10325476:
2466+
; AVX512VL: # %bb.0:
2467+
; AVX512VL-NEXT: vprold $16, %xmm0, %xmm0
2468+
; AVX512VL-NEXT: retq
24752469
;
24762470
; XOP-LABEL: shuffle_v8i16_10325476:
24772471
; XOP: # %bb.0:
2478-
; XOP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
2479-
; XOP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
2472+
; XOP-NEXT: vprotd $16, %xmm0, %xmm0
24802473
; XOP-NEXT: retq
24812474
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
24822475
ret <8 x i16> %shuffle
@@ -2506,21 +2499,14 @@ define <8 x i16> @shuffle_v8i16_12305674(<8 x i16> %a) {
25062499
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9]
25072500
; AVX2-FAST-NEXT: retq
25082501
;
2509-
; AVX512VL-SLOW-LABEL: shuffle_v8i16_12305674:
2510-
; AVX512VL-SLOW: # %bb.0:
2511-
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2512-
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2513-
; AVX512VL-SLOW-NEXT: retq
2514-
;
2515-
; AVX512VL-FAST-LABEL: shuffle_v8i16_12305674:
2516-
; AVX512VL-FAST: # %bb.0:
2517-
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9]
2518-
; AVX512VL-FAST-NEXT: retq
2502+
; AVX512VL-LABEL: shuffle_v8i16_12305674:
2503+
; AVX512VL: # %bb.0:
2504+
; AVX512VL-NEXT: vprolq $16, %xmm0, %xmm0
2505+
; AVX512VL-NEXT: retq
25192506
;
25202507
; XOP-LABEL: shuffle_v8i16_12305674:
25212508
; XOP: # %bb.0:
2522-
; XOP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2523-
; XOP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2509+
; XOP-NEXT: vprotq $16, %xmm0, %xmm0
25242510
; XOP-NEXT: retq
25252511
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
25262512
ret <8 x i16> %shuffle

llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll

Lines changed: 13 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5385,7 +5385,7 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2
53855385
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
53865386
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm3 = xmm3[2,3,0,1],xmm2[2,3,0,1],xmm3[6,7,4,5],xmm2[6,7,4,5]
53875387
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[2,3,0,1,6,7],xmm2[4,5],xmm1[4,5],xmm2[4,5],xmm1[6,7],xmm2[4,5]
5388-
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
5388+
; XOPAVX1-NEXT: vprotd $16, %xmm0, %xmm0
53895389
; XOPAVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
53905390
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
53915391
; XOPAVX1-NEXT: retq
@@ -7054,24 +7054,16 @@ define <16 x i16> @shuffle_v16i16_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_1
70547054
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
70557055
; AVX2-FAST-NEXT: retq
70567056
;
7057-
; AVX512VL-SLOW-LABEL: shuffle_v16i16_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
7058-
; AVX512VL-SLOW: # %bb.0:
7059-
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
7060-
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
7061-
; AVX512VL-SLOW-NEXT: retq
7062-
;
7063-
; AVX512VL-FAST-LABEL: shuffle_v16i16_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
7064-
; AVX512VL-FAST: # %bb.0:
7065-
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
7066-
; AVX512VL-FAST-NEXT: retq
7057+
; AVX512VL-LABEL: shuffle_v16i16_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
7058+
; AVX512VL: # %bb.0:
7059+
; AVX512VL-NEXT: vprold $16, %ymm0, %ymm0
7060+
; AVX512VL-NEXT: retq
70677061
;
70687062
; XOPAVX1-LABEL: shuffle_v16i16_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
70697063
; XOPAVX1: # %bb.0:
7070-
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7]
7071-
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
7064+
; XOPAVX1-NEXT: vprotd $16, %xmm0, %xmm1
70727065
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
7073-
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
7074-
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
7066+
; XOPAVX1-NEXT: vprotd $16, %xmm0, %xmm0
70757067
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
70767068
; XOPAVX1-NEXT: retq
70777069
;
@@ -7106,24 +7098,16 @@ define <16 x i16> @shuffle_v16i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_1
71067098
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,2,3,4,5,14,15,8,9,10,11,12,13,22,23,16,17,18,19,20,21,30,31,24,25,26,27,28,29]
71077099
; AVX2-FAST-NEXT: retq
71087100
;
7109-
; AVX512VL-SLOW-LABEL: shuffle_v16i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
7110-
; AVX512VL-SLOW: # %bb.0:
7111-
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,2,4,5,6,7,11,8,9,10,12,13,14,15]
7112-
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14]
7113-
; AVX512VL-SLOW-NEXT: retq
7114-
;
7115-
; AVX512VL-FAST-LABEL: shuffle_v16i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
7116-
; AVX512VL-FAST: # %bb.0:
7117-
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,2,3,4,5,14,15,8,9,10,11,12,13,22,23,16,17,18,19,20,21,30,31,24,25,26,27,28,29]
7118-
; AVX512VL-FAST-NEXT: retq
7101+
; AVX512VL-LABEL: shuffle_v16i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
7102+
; AVX512VL: # %bb.0:
7103+
; AVX512VL-NEXT: vprolq $48, %ymm0, %ymm0
7104+
; AVX512VL-NEXT: retq
71197105
;
71207106
; XOPAVX1-LABEL: shuffle_v16i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
71217107
; XOPAVX1: # %bb.0:
7122-
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,0,1,2,4,5,6,7]
7123-
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,6]
7108+
; XOPAVX1-NEXT: vprotq $48, %xmm0, %xmm1
71247109
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
7125-
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
7126-
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6]
7110+
; XOPAVX1-NEXT: vprotq $48, %xmm0, %xmm0
71277111
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
71287112
; XOPAVX1-NEXT: retq
71297113
;

0 commit comments

Comments
 (0)