Skip to content

Commit 618a890

Browse files
committed
[X86] Increase the depth threshold required to form VPERMI2W/VPERMI2B in shuffle combining
These instructions are implemented with two port 5 uops and one port 015 uop, so they are more complicated than most shuffles. This patch increases the depth threshold at which we form them during shuffle combining, to try to limit increases in the number of uops, especially on port 5. Differential Revision: https://reviews.llvm.org/D88503
1 parent 0a146a9 commit 618a890

File tree

6 files changed

+71
-188
lines changed

6 files changed

+71
-188
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35351,6 +35351,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3535135351
// Depth threshold above which we can efficiently use variable mask shuffles.
3535235352
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
3535335353
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
35354+
// VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
35355+
// higher depth before combining them.
35356+
bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
3535435357

3535535358
bool MaskContainsZeros = isAnyZero(Mask);
3535635359

@@ -35387,9 +35390,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3538735390
MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
3538835391
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
3538935392
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
35390-
(Subtarget.hasBWI() &&
35393+
(Subtarget.hasBWI() && AllowBWIVPERMV3 &&
3539135394
(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
35392-
(Subtarget.hasVBMI() &&
35395+
(Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
3539335396
(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
3539435397
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
3539535398
for (unsigned i = 0; i != NumMaskElts; ++i)
@@ -35416,9 +35419,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3541635419
MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
3541735420
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
3541835421
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
35419-
(Subtarget.hasBWI() &&
35422+
(Subtarget.hasBWI() && AllowBWIVPERMV3 &&
3542035423
(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
35421-
(Subtarget.hasVBMI() &&
35424+
(Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
3542235425
(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
3542335426
V1 = DAG.getBitcast(MaskVT, V1);
3542435427
V2 = DAG.getBitcast(MaskVT, V2);
@@ -35588,10 +35591,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3558835591
MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
3558935592
MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
3559035593
MaskVT == MVT::v16i32)) ||
35591-
(Subtarget.hasBWI() && (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
35592-
MaskVT == MVT::v32i16)) ||
35593-
(Subtarget.hasVBMI() && (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
35594-
MaskVT == MVT::v64i8)))) {
35594+
(Subtarget.hasBWI() && AllowBWIVPERMV3 &&
35595+
(MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
35596+
(Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
35597+
(MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
3559535598
V1 = DAG.getBitcast(MaskVT, V1);
3559635599
V2 = DAG.getBitcast(MaskVT, V2);
3559735600
Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);

llvm/test/CodeGen/X86/min-legal-vector-width.ll

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -857,10 +857,10 @@ define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-wi
857857
define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
858858
; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
859859
; CHECK: # %bb.0:
860-
; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm1
861-
; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm2
862-
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
863-
; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
860+
; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0
861+
; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1
862+
; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
863+
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
864864
; CHECK-NEXT: retq
865865
%a = load <8 x i64>, <8 x i64>* %x
866866
%b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
@@ -920,9 +920,10 @@ define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vect
920920
define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
921921
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
922922
; CHECK: # %bb.0:
923-
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
924-
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
925-
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
923+
; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0
924+
; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1
925+
; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
926+
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
926927
; CHECK-NEXT: retq
927928
%a = load <16 x i32>, <16 x i32>* %x
928929
%b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -931,20 +932,13 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-
931932
}
932933

933934
define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
934-
; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
935-
; CHECK-AVX512: # %bb.0:
936-
; CHECK-AVX512-NEXT: vpsraw $8, 32(%rdi), %ymm0
937-
; CHECK-AVX512-NEXT: vpsraw $8, (%rdi), %ymm1
938-
; CHECK-AVX512-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
939-
; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
940-
; CHECK-AVX512-NEXT: retq
941-
;
942-
; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
943-
; CHECK-VBMI: # %bb.0:
944-
; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
945-
; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
946-
; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
947-
; CHECK-VBMI-NEXT: retq
935+
; CHECK-LABEL: trunc_v32i16_v32i8_sign:
936+
; CHECK: # %bb.0:
937+
; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0
938+
; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1
939+
; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
940+
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
941+
; CHECK-NEXT: retq
948942
%a = load <32 x i16>, <32 x i16>* %x
949943
%b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
950944
%c = trunc <32 x i16> %b to <32 x i8>

llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll

Lines changed: 10 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -304,24 +304,11 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
304304
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
305305
; AVX1-NEXT: retq
306306
;
307-
; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
308-
; AVX2: # %bb.0:
309-
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
310-
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
311-
; AVX2-NEXT: retq
312-
;
313-
; AVX512VLBW-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
314-
; AVX512VLBW: # %bb.0:
315-
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
316-
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
317-
; AVX512VLBW-NEXT: retq
318-
;
319-
; AVX512VLVBMI-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
320-
; AVX512VLVBMI: # %bb.0:
321-
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,0,17,0,18,0,19,0,20,0,21,0,22,0,23]
322-
; AVX512VLVBMI-NEXT: vpermi2b %xmm0, %xmm1, %xmm2
323-
; AVX512VLVBMI-NEXT: vmovdqa %xmm2, %xmm0
324-
; AVX512VLVBMI-NEXT: retq
307+
; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
308+
; AVX2OR512VL: # %bb.0:
309+
; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
310+
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
311+
; AVX2OR512VL-NEXT: retq
325312
;
326313
; XOPAVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
327314
; XOPAVX1: # %bb.0:
@@ -1335,23 +1322,11 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(
13351322
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
13361323
; SSE-NEXT: retq
13371324
;
1338-
; AVX1OR2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1339-
; AVX1OR2: # %bb.0:
1340-
; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1341-
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1342-
; AVX1OR2-NEXT: retq
1343-
;
1344-
; AVX512VLBW-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1345-
; AVX512VLBW: # %bb.0:
1346-
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1347-
; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1348-
; AVX512VLBW-NEXT: retq
1349-
;
1350-
; AVX512VLVBMI-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1351-
; AVX512VLVBMI: # %bb.0:
1352-
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,1,17,4,20,5,21,2,18,3,19,6,22,7,23]
1353-
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
1354-
; AVX512VLVBMI-NEXT: retq
1325+
; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1326+
; AVX: # %bb.0:
1327+
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1328+
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1329+
; AVX-NEXT: retq
13551330
%shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
13561331
ret <16 x i8> %shuffle
13571332
}

llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll

Lines changed: 25 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,23 +1017,11 @@ define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
10171017
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
10181018
; SSE-NEXT: retq
10191019
;
1020-
; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f:
1021-
; AVX1OR2: # %bb.0:
1022-
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1023-
; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1024-
; AVX1OR2-NEXT: retq
1025-
;
1026-
; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f:
1027-
; AVX512VL-SLOW: # %bb.0:
1028-
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1029-
; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1030-
; AVX512VL-SLOW-NEXT: retq
1031-
;
1032-
; AVX512VL-FAST-LABEL: shuffle_v8i16_0c1d2e3f:
1033-
; AVX512VL-FAST: # %bb.0:
1034-
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,2,14,3,15]
1035-
; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
1036-
; AVX512VL-FAST-NEXT: retq
1020+
; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
1021+
; AVX: # %bb.0:
1022+
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1023+
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1024+
; AVX-NEXT: retq
10371025
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
10381026
ret <8 x i16> %shuffle
10391027
}
@@ -1059,23 +1047,11 @@ define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
10591047
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
10601048
; SSE-NEXT: retq
10611049
;
1062-
; AVX1OR2-LABEL: shuffle_v8i16_48596a7b:
1063-
; AVX1OR2: # %bb.0:
1064-
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1065-
; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1066-
; AVX1OR2-NEXT: retq
1067-
;
1068-
; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b:
1069-
; AVX512VL-SLOW: # %bb.0:
1070-
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1071-
; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1072-
; AVX512VL-SLOW-NEXT: retq
1073-
;
1074-
; AVX512VL-FAST-LABEL: shuffle_v8i16_48596a7b:
1075-
; AVX512VL-FAST: # %bb.0:
1076-
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8,5,9,6,10,7,11]
1077-
; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
1078-
; AVX512VL-FAST-NEXT: retq
1050+
; AVX-LABEL: shuffle_v8i16_48596a7b:
1051+
; AVX: # %bb.0:
1052+
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1053+
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1054+
; AVX-NEXT: retq
10791055
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
10801056
ret <8 x i16> %shuffle
10811057
}
@@ -1424,23 +1400,11 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
14241400
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
14251401
; SSE41-NEXT: retq
14261402
;
1427-
; AVX1OR2-LABEL: shuffle_v8i16_012dXXXX:
1428-
; AVX1OR2: # %bb.0:
1429-
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1430-
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
1431-
; AVX1OR2-NEXT: retq
1432-
;
1433-
; AVX512VL-SLOW-LABEL: shuffle_v8i16_012dXXXX:
1434-
; AVX512VL-SLOW: # %bb.0:
1435-
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1436-
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
1437-
; AVX512VL-SLOW-NEXT: retq
1438-
;
1439-
; AVX512VL-FAST-LABEL: shuffle_v8i16_012dXXXX:
1440-
; AVX512VL-FAST: # %bb.0:
1441-
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,4,5,6,7]
1442-
; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
1443-
; AVX512VL-FAST-NEXT: retq
1403+
; AVX-LABEL: shuffle_v8i16_012dXXXX:
1404+
; AVX: # %bb.0:
1405+
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1406+
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
1407+
; AVX-NEXT: retq
14441408
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
14451409
ret <8 x i16> %shuffle
14461410
}
@@ -1475,24 +1439,11 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
14751439
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
14761440
; AVX1-NEXT: retq
14771441
;
1478-
; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
1479-
; AVX2: # %bb.0:
1480-
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
1481-
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
1482-
; AVX2-NEXT: retq
1483-
;
1484-
; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXXXcde3:
1485-
; AVX512VL-SLOW: # %bb.0:
1486-
; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
1487-
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
1488-
; AVX512VL-SLOW-NEXT: retq
1489-
;
1490-
; AVX512VL-FAST-LABEL: shuffle_v8i16_XXXXcde3:
1491-
; AVX512VL-FAST: # %bb.0:
1492-
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,11]
1493-
; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
1494-
; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
1495-
; AVX512VL-FAST-NEXT: retq
1442+
; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3:
1443+
; AVX2OR512VL: # %bb.0:
1444+
; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0
1445+
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
1446+
; AVX2OR512VL-NEXT: retq
14961447
;
14971448
; XOPAVX1-LABEL: shuffle_v8i16_XXXXcde3:
14981449
; XOPAVX1: # %bb.0:
@@ -1533,24 +1484,11 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
15331484
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
15341485
; SSE41-NEXT: retq
15351486
;
1536-
; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX:
1537-
; AVX1OR2: # %bb.0:
1538-
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1539-
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
1540-
; AVX1OR2-NEXT: retq
1541-
;
1542-
; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX:
1543-
; AVX512VL-SLOW: # %bb.0:
1544-
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1545-
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
1546-
; AVX512VL-SLOW-NEXT: retq
1547-
;
1548-
; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX:
1549-
; AVX512VL-FAST: # %bb.0:
1550-
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,4,5,6,7]
1551-
; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
1552-
; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
1553-
; AVX512VL-FAST-NEXT: retq
1487+
; AVX-LABEL: shuffle_v8i16_cde3XXXX:
1488+
; AVX: # %bb.0:
1489+
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1490+
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
1491+
; AVX-NEXT: retq
15541492
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
15551493
ret <8 x i16> %shuffle
15561494
}

llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4804,29 +4804,11 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
48044804
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
48054805
; AVX1-NEXT: retq
48064806
;
4807-
; AVX2-LABEL: PR28136:
4808-
; AVX2: # %bb.0:
4809-
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
4810-
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4811-
; AVX2-NEXT: retq
4812-
;
4813-
; AVX512VLBW-LABEL: PR28136:
4814-
; AVX512VLBW: # %bb.0:
4815-
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
4816-
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4817-
; AVX512VLBW-NEXT: retq
4818-
;
4819-
; AVX512VLVBMI-SLOW-LABEL: PR28136:
4820-
; AVX512VLVBMI-SLOW: # %bb.0:
4821-
; AVX512VLVBMI-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
4822-
; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4823-
; AVX512VLVBMI-SLOW-NEXT: retq
4824-
;
4825-
; AVX512VLVBMI-FAST-LABEL: PR28136:
4826-
; AVX512VLVBMI-FAST: # %bb.0:
4827-
; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,16,48,17,49,18,50,19,51,4,36,5,37,6,38,7,39,20,52,21,53,22,54,23,55]
4828-
; AVX512VLVBMI-FAST-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
4829-
; AVX512VLVBMI-FAST-NEXT: retq
4807+
; AVX2OR512VL-LABEL: PR28136:
4808+
; AVX2OR512VL: # %bb.0:
4809+
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
4810+
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4811+
; AVX2OR512VL-NEXT: retq
48304812
;
48314813
; XOPAVX1-LABEL: PR28136:
48324814
; XOPAVX1: # %bb.0:

llvm/test/CodeGen/X86/vector-zext.ll

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,20 +1902,11 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
19021902
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
19031903
; AVX2-NEXT: retq
19041904
;
1905-
; AVX512F-LABEL: shuf_zext_8i16_to_4i64_offset2:
1906-
; AVX512F: # %bb.0: # %entry
1907-
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
1908-
; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1909-
; AVX512F-NEXT: retq
1910-
;
1911-
; AVX512BW-LABEL: shuf_zext_8i16_to_4i64_offset2:
1912-
; AVX512BW: # %bb.0: # %entry
1913-
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1914-
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,33,34,35,3,37,38,39,4,41,42,43,5,45,46,47]
1915-
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
1916-
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0
1917-
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1918-
; AVX512BW-NEXT: retq
1905+
; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
1906+
; AVX512: # %bb.0: # %entry
1907+
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
1908+
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1909+
; AVX512-NEXT: retq
19191910
entry:
19201911
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
19211912
%Z = bitcast <16 x i16> %B to <4 x i64>

0 commit comments

Comments
 (0)