
[X86] lowerShuffleWithUndefHalf - don't split vXi8 unary shuffles if the 128-bit lanes are already in place #122919

Merged — 1 commit merged on Jan 15, 2025
6 changes: 5 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15672,12 +15672,16 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
         (!isSingleSHUFPSMask(HalfMask) ||
          Subtarget.hasFastVariableCrossLaneShuffle()))
       return SDValue();
-    // If this is a unary shuffle (assume that the 2nd operand is
+    // If this is an unary shuffle (assume that the 2nd operand is
     // canonicalized to undef), then we can use vpermpd. Otherwise, we
     // are better off extracting the upper half of 1 operand and using a
     // narrow shuffle.
     if (EltWidth == 64 && V2.isUndef())
       return SDValue();
+    // If this is an unary vXi8 shuffle with inplace halves, then perform as
+    // full width pshufb, and then merge.
+    if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
+      return SDValue();
   }
   // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
   if (Subtarget.hasAVX512() && VT.is512BitVector())
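A note on the new early-out above: requiring `HalfIdx1 == 0 && HalfIdx2 == 1` means both halves of the result come from the low and high 128-bit halves of the same source, in order, so splitting into two half-width shuffles buys nothing over a single full-width byte shuffle followed by a merge (compare the updated test output below). The sketch that follows restates the condition in isolation; the function name and parameter list are invented for this note and are not LLVM APIs.

```cpp
#include <cassert>

// Sketch only: restates the early-out added in the hunk above as a standalone
// predicate. The name and parameters are invented for illustration.
// HalfIdx values 0 and 1 name the lower and upper 128-bit half of the first
// source operand (2 and 3 would name the second source), so requiring (0, 1)
// also implies the shuffle is unary.
static bool keepFullWidthByteShuffle(unsigned EltWidthInBits, int HalfIdx1,
                                     int HalfIdx2) {
  // vXi8 shuffle whose 128-bit lanes are already in place: bail out of the
  // half-split so the generic path emits one full-width pshufb plus a merge.
  return EltWidthInBits == 8 && HalfIdx1 == 0 && HalfIdx2 == 1;
}

int main() {
  assert(keepFullWidthByteShuffle(8, 0, 1));   // v32i8, halves in place
  assert(!keepFullWidthByteShuffle(8, 1, 0));  // halves swapped: still split
  assert(!keepFullWidthByteShuffle(64, 0, 1)); // i64 elements: existing path
  return 0;
}
```

Returning true here corresponds to the `return SDValue()` added in the patch: give up on the split-in-half lowering and let the later, full-width lowering produce the pshufb-and-merge sequence.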
34 changes: 14 additions & 20 deletions llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -699,16 +699,13 @@ define <16 x i8> @evenelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwi
;
; AVX2-LABEL: evenelts_v32i16_shuffle_v16i16_to_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -783,16 +780,13 @@ define <16 x i8> @oddelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwin
;
; AVX2-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
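For reference, this is what the even- and odd-element tests above compute, written as plain scalar C++. It is a reference model drafted for this note — the helper name echoes the test name, but nothing below comes from the repository.

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Scalar reference for evenelts_v32i16_shuffle_v16i16_to_v16i8: keep the even
// i16 elements of a 32-element vector and truncate each one to its low byte.
// The AVX2 sequences above compute the same result with in-lane byte shuffles
// followed by a cross-lane merge.
static std::array<uint8_t, 16>
evenEltsTruncRef(const std::array<uint16_t, 32> &x) {
  std::array<uint8_t, 16> out{};
  for (int i = 0; i < 16; ++i)
    out[i] = static_cast<uint8_t>(x[2 * i]); // low byte of element 2*i
  return out;
}

int main() {
  std::array<uint16_t, 32> in{};
  for (int i = 0; i < 32; ++i)
    in[i] = static_cast<uint16_t>(0x0100 + i); // low byte = element index
  const auto out = evenEltsTruncRef(in);
  for (int i = 0; i < 16; ++i)
    assert(out[i] == 2 * i); // bytes 0, 2, 4, ..., 30
  return 0;
}
```

The odd-element variant is the same computation shifted by one element, which is why its byte-shuffle mask uses offsets 2, 6, 10, 14 instead of 0, 4, 8, 12.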
56 changes: 24 additions & 32 deletions llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -275,53 +275,45 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
12 changes: 7 additions & 5 deletions llvm/test/CodeGen/X86/trunc-vector-width.ll
@@ -4,14 +4,16 @@
define void @test(ptr %a0) #0 {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %xmm0
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,5,5,0,0,1,1,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu (%rdi), %ymm0
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,0,0]
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = mem[0],ymm0[1,2,3,4,5,6,7]
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; CHECK-NEXT: vpextrb $1, %xmm0, (%rax)
; CHECK-NEXT: vpextrb $4, %xmm0, (%rax)
; CHECK-NEXT: vpextrb $8, %xmm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%load = load <64 x i8>, ptr %a0, align 1
%shuf = shufflevector <64 x i8> %load, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>