[X86] lowerShuffleWithUndefHalf - don't split vXi8 unary shuffles if the 128-bit lanes are already in place #122919
Merged
Allows us to use PSHUFB to shuffle the lanes, and then perform a sub-lane permutation down to the lower half.

Fixes llvm#116815

@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)

Patch is 1.21 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/122919.diff

13 Files Affected:
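As a rough sketch of the kind of IR that reaches this path, modeled on the tests updated below (the function name and exact mask here are illustrative and not taken from the PR): a unary vXi8 shuffle where only the lower half of the result is needed and each output half draws from the 128-bit lane it already occupies.

; Unary v32i8 shuffle: the result only needs 16 bytes, and each output half
; comes from the 128-bit lane it already sits in, which is roughly the shape
; the new early-out keeps as a full-width PSHUFB plus a sub-lane permutation
; rather than splitting into two 128-bit shuffles.
define <16 x i8> @even_bytes_v32i8(<32 x i8> %v) {
  %s = shufflevector <32 x i8> %v, <32 x i8> poison,
       <16 x i32> <i32 0,  i32 2,  i32 4,  i32 6,  i32 8,  i32 10, i32 12, i32 14,
                   i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i8> %s
}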
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d4152ff4a816c4..90e3e15b1fb46c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15672,12 +15672,16 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
(!isSingleSHUFPSMask(HalfMask) ||
Subtarget.hasFastVariableCrossLaneShuffle()))
return SDValue();
- // If this is a unary shuffle (assume that the 2nd operand is
+ // If this is an unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
// are better off extracting the upper half of 1 operand and using a
// narrow shuffle.
if (EltWidth == 64 && V2.isUndef())
return SDValue();
+ // If this is an unary vXi8 shuffle with inplace halves, then perform as
+ // full width pshufb, and then merge.
+ if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
+ return SDValue();
}
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 9642e5e4c9f868..26af46263c0e2c 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -699,16 +699,13 @@ define <16 x i8> @evenelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwi
;
; AVX2-LABEL: evenelts_v32i16_shuffle_v16i16_to_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -783,16 +780,13 @@ define <16 x i8> @oddelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwin
;
; AVX2-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index f0f02f1ed890ae..ec442c185706cf 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -275,53 +275,45 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vpsrld $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/trunc-vector-width.ll b/llvm/test/CodeGen/X86/trunc-vector-width.ll
index bc6969c5cd37a6..42cc624b5a5359 100644
--- a/llvm/test/CodeGen/X86/trunc-vector-width.ll
+++ b/llvm/test/CodeGen/X86/trunc-vector-width.ll
@@ -4,14 +4,16 @@
define void @test(ptr %a0) #0 {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqu (%rdi), %xmm0
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,5,5,0,0,1,1,u,u,u,u,u,u,u,u]
-; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqu (%rdi), %ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,0,0]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = mem[0],ymm0[1,2,3,4,5,6,7]
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; CHECK-NEXT: vpextrb $1, %xmm0, (%rax)
; CHECK-NEXT: vpextrb $4, %xmm0, (%rax)
; CHECK-NEXT: vpextrb $8, %xmm0, (%rax)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%load = load <64 x i8>, ptr %a0, align 1
%shuf = shufflevector <64 x i8> %load, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 01181d4b21d9d7..abef980277ecea 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -849,146 +849,122 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-LABEL: load_i8_stride4_vf16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4]
+; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm6
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm5
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX2-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX2-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX2-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX2-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX2-NEXT: vmovdqa %xmm5, (%rcx)
; AVX2-NEXT: vmovdqa %xmm0, (%r8)
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i8_stride4_vf16:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4]
+; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX2-FP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm6
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-FP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX2-FP-NEXT: vmovdqa %xmm5, (%rcx)
; AVX2-FP-NEXT: vmovdqa %xmm0, (%r8)
+; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i8_stride4_vf16:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-FCP-N...
[truncated]