Commit 8ac00ca

[X86] lowerShuffleWithUndefHalf - don't split vXi8 unary shuffles if the 128-bit source lanes are already in place (#122919)
Allows us to use PSHUFB to shuffle the lanes, and then perform a sub-lane permutation down to the lower half. Fixes #116815.
1 parent 929eb50 commit 8ac00ca

13 files changed, +7061 −8493 lines
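
As a sketch of the kind of shuffle this change targets (an illustrative example written for this note, not taken from the patch or its tests): a unary v32i8 shuffle that only needs the low 128 bits of the result but draws bytes from both 128-bit source lanes, where every byte can be produced in-lane. Previously such shuffles were split into two 128-bit halves; with the new check they can instead be lowered as a single full-width PSHUFB followed by a cross-lane merge into the low half, as in the test diffs below. The function name below is hypothetical.

; Hypothetical LLVM IR sketch: a lane-local unary v32i8 shuffle that keeps only
; the even bytes. The low half of the result draws from both source halves
; (HalfIdx1 == 0, HalfIdx2 == 1 in the lowering's terms), and each byte stays
; inside its own 128-bit source lane, so the new EltWidth == 8 early-out can apply.
define <16 x i8> @even_bytes_v32i8(<32 x i8> %v) {
  %s = shufflevector <32 x i8> %v, <32 x i8> undef,
       <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                   i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i8> %s
}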

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 5 additions & 1 deletion
@@ -15672,12 +15672,16 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
         (!isSingleSHUFPSMask(HalfMask) ||
          Subtarget.hasFastVariableCrossLaneShuffle()))
       return SDValue();
-    // If this is a unary shuffle (assume that the 2nd operand is
+    // If this is an unary shuffle (assume that the 2nd operand is
     // canonicalized to undef), then we can use vpermpd. Otherwise, we
     // are better off extracting the upper half of 1 operand and using a
     // narrow shuffle.
     if (EltWidth == 64 && V2.isUndef())
       return SDValue();
+    // If this is an unary vXi8 shuffle with inplace halves, then perform as
+    // full width pshufb, and then merge.
+    if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
+      return SDValue();
   }
   // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
   if (Subtarget.hasAVX512() && VT.is512BitVector())

llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll

Lines changed: 14 additions & 20 deletions
@@ -699,16 +699,13 @@ define <16 x i8> @evenelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwi
 ;
 ; AVX2-LABEL: evenelts_v32i16_shuffle_v16i16_to_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -783,16 +780,13 @@ define <16 x i8> @oddelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwin
 ;
 ; AVX2-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vmovd {{.*#+}} xmm3 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq

llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll

Lines changed: 24 additions & 32 deletions
@@ -275,53 +275,45 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
 ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vpsrld $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vpsrld $8, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vpsrld $8, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpsrld $8, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;

llvm/test/CodeGen/X86/trunc-vector-width.ll

Lines changed: 7 additions & 5 deletions
@@ -4,14 +4,16 @@
 define void @test(ptr %a0) #0 {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqu (%rdi), %xmm0
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,4,5,5,0,0,1,1,u,u,u,u,u,u,u,u]
-; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqu (%rdi), %ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,0,0]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0],ymm0[1,2,3,4,5,6,7]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; CHECK-NEXT:    vpextrb $1, %xmm0, (%rax)
 ; CHECK-NEXT:    vpextrb $4, %xmm0, (%rax)
 ; CHECK-NEXT:    vpextrb $8, %xmm0, (%rax)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %load = load <64 x i8>, ptr %a0, align 1
   %shuf = shufflevector <64 x i8> %load, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
