Skip to content

Commit d9f1166

Browse files
authored
[X86] Combine VPERMV3 to VPERMV for i8/i16 (#96414)
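VPERM[I,T]2[B,W] are 3 uops on Skylake and Icelake, so when both sources of a VPERMV3 are the low and high halves of the same wider vector we try to use a single-source VPERMV (VPERM[B,W]) on that wider vector instead.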
1 parent 2ff22d7 commit d9f1166

File tree

3 files changed

+41
-26
lines changed

3 files changed

+41
-26
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41333,6 +41333,32 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4133341333

4133441334
return SDValue();
4133541335
}
41336+
case X86ISD::VPERMV3: {
41337+
// VPERM[I,T]2[B,W] are 3 uops on Skylake and Icelake so we try to use
41338+
// VPERMV.
41339+
SDValue V1 = N.getOperand(0);
41340+
SDValue V2 = N.getOperand(2);
41341+
MVT EVT = VT.getVectorElementType();
41342+
MVT NVT = VT.getDoubleNumVectorElementsVT();
41343+
if ((EVT == MVT::i8 || EVT == MVT::i16) &&
41344+
(NVT.is256BitVector() ||
41345+
(NVT.is512BitVector() && Subtarget.hasEVEX512())) &&
41346+
V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41347+
V1.getConstantOperandVal(1) == 0 &&
41348+
V2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41349+
V2.getConstantOperandVal(1) == VT.getVectorNumElements() &&
41350+
V1.getOperand(0) == V2.getOperand(0)) {
41351+
SDValue Mask =
41352+
DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NVT, DAG.getUNDEF(NVT),
41353+
N.getOperand(1), DAG.getIntPtrConstant(0, DL));
41354+
return DAG.getNode(
41355+
ISD::EXTRACT_SUBVECTOR, DL, VT,
41356+
DAG.getNode(X86ISD::VPERMV, DL, NVT, Mask, V1.getOperand(0)),
41357+
DAG.getIntPtrConstant(0, DL));
41358+
}
41359+
41360+
return SDValue();
41361+
}
4133641362
default:
4133741363
return SDValue();
4133841364
}

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -393,9 +393,8 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <1
393393
define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
394394
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
395395
; CHECK: # %bb.0:
396-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
397-
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
398-
; CHECK-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
396+
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
397+
; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
399398
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
400399
; CHECK-NEXT: retq
401400
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
@@ -404,11 +403,10 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
404403
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
405404
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
406405
; CHECK: # %bb.0:
407-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
408-
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
409-
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
406+
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
407+
; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
410408
; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
411-
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
409+
; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
412410
; CHECK-NEXT: retq
413411
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
414412
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -419,10 +417,9 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16
419417
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
420418
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
421419
; CHECK: # %bb.0:
422-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
423-
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
420+
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
424421
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
425-
; CHECK-NEXT: vpermt2w %ymm2, %ymm3, %ymm0 {%k1} {z}
422+
; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
426423
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
427424
; CHECK-NEXT: retq
428425
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
@@ -477,8 +474,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i
477474
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
478475
; CHECK: # %bb.0:
479476
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
480-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
481-
; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
477+
; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
482478
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
483479
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
484480
; CHECK-NEXT: vzeroupper
@@ -493,9 +489,8 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x
493489
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
494490
; CHECK: # %bb.0:
495491
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
496-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
497492
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
498-
; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
493+
; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
499494
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
500495
; CHECK-NEXT: vzeroupper
501496
; CHECK-NEXT: retq
@@ -508,8 +503,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i
508503
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
509504
; CHECK: # %bb.0:
510505
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
511-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
512-
; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
506+
; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
513507
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
514508
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
515509
; CHECK-NEXT: vzeroupper
@@ -524,9 +518,8 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x
524518
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
525519
; CHECK: # %bb.0:
526520
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
527-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
528521
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
529-
; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
522+
; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
530523
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
531524
; CHECK-NEXT: vzeroupper
532525
; CHECK-NEXT: retq
@@ -539,8 +532,7 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
539532
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
540533
; CHECK: # %bb.0:
541534
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
542-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
543-
; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0
535+
; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
544536
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
545537
; CHECK-NEXT: vzeroupper
546538
; CHECK-NEXT: retq
@@ -551,8 +543,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i
551543
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
552544
; CHECK: # %bb.0:
553545
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
554-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
555-
; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
546+
; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
556547
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
557548
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
558549
; CHECK-NEXT: vzeroupper
@@ -567,9 +558,8 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x
567558
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
568559
; CHECK: # %bb.0:
569560
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
570-
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
571561
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
572-
; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
562+
; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
573563
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
574564
; CHECK-NEXT: vzeroupper
575565
; CHECK-NEXT: retq

llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,8 +348,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
348348
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
349349
; AVX512VBMIVL: # %bb.0:
350350
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
351-
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
352-
; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
351+
; AVX512VBMIVL-NEXT: vpermb %zmm0, %zmm1, %zmm0
353352
; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
354353
; AVX512VBMIVL-NEXT: vzeroupper
355354
; AVX512VBMIVL-NEXT: retq

0 commit comments

Comments
 (0)