[X86] getFauxShuffleMask - insert_subvector - skip undemanded subvectors #129042
Conversation
If the shuffle combine doesn't require the subvector of an insert_subvector node, we can just combine the base vector directly.
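To make the new early-out concrete, here is a minimal standalone sketch of the check the patch adds to getFauxShuffleMask. This is not the LLVM code itself: APInt/SDValue/SmallVector are replaced with plain integers and std::vector, the helper name is made up, and it assumes at most 63 lanes so the demanded-lane set fits a uint64_t.

```cpp
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

constexpr int SentinelUndef = -1; // stands in for SM_SentinelUndef

// Returns true (and fills Mask with the identity permutation of the base
// vector) when none of the demanded lanes fall inside the inserted subvector,
// mirroring the DemandedElts.extractBits(NumSubElts, InsertIdx) == 0 check.
bool skipUndemandedSubvector(uint64_t DemandedElts, unsigned NumElts,
                             unsigned NumSubElts, unsigned InsertIdx,
                             std::vector<int> &Mask) {
  uint64_t SubLanes = ((1ULL << NumSubElts) - 1) << InsertIdx;
  if (DemandedElts & SubLanes)
    return false; // an inserted lane is demanded - can't skip the subvector
  Mask.assign(NumElts, SentinelUndef);
  std::iota(Mask.begin(), Mask.end(), 0); // identity: lane i reads base lane i
  return true;
}

int main() {
  std::vector<int> Mask;
  // 8-lane vector, 4-lane subvector inserted at lane 4, but only lanes 0..3
  // are demanded, so the insert_subvector can be looked through.
  if (skipUndemandedSubvector(/*DemandedElts=*/0x0F, /*NumElts=*/8,
                              /*NumSubElts=*/4, /*InsertIdx=*/4, Mask)) {
    for (int M : Mask)
      std::cout << M << ' ';
    std::cout << '\n'; // prints: 0 1 2 3 4 5 6 7
  }
}
```

In this situation the shuffle combine only sees the base vector through an identity mask, so the insert of the undemanded subvector no longer blocks further combining.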
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If the shuffle combine doesn't require the subvector of an insert_subvector node, we can just combine the base vector directly.

Patch is 177.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129042.diff

11 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 63d66c66d94d2..12636f22d8409 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6145,6 +6145,13 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
uint64_t InsertIdx = N.getConstantOperandVal(2);
+ // Subvector isn't demanded - just return the base vector.
+ if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
+ Mask.resize(NumElts, SM_SentinelUndef);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ Ops.push_back(Src);
+ return true;
+ }
// Handle CONCAT(SUB0, SUB1).
// Limit this to vXi64 vector cases to make the most of cross lane shuffles.
if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index d3ac3f1f64c83..18ca01290c914 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -270,11 +270,10 @@ define <16 x i16> @insert_i16_firstelts(<16 x i16> %x, i16 %s) {
; AVX2-LABEL: insert_i16_firstelts:
; AVX2: # %bb.0:
; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vmovd %edi, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
%i0 = insertelement <16 x i16> %x, i16 %s, i32 0
%i1 = insertelement <16 x i16> %i0, i16 %s, i32 8
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 7ce37c637a79c..31e4f7e350a05 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -693,20 +693,18 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, ptr %ptr) nounwind {
; KNL-LABEL: insert_v16i16:
; KNL: ## %bb.0:
; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vmovd %edi, %xmm2
+; KNL-NEXT: vpbroadcastw %xmm2, %ymm2
+; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT: vmovd %edi, %xmm1
-; KNL-NEXT: vpbroadcastw %xmm1, %ymm1
-; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v16i16:
; SKX: ## %bb.0:
; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT: vpbroadcastw %edi, %ymm2
+; SKX-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vpbroadcastw %edi, %ymm1
-; SKX-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; SKX-NEXT: retq
%val = load i16, ptr %ptr
%r1 = insertelement <16 x i16> %x, i16 %val, i32 1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index d83a61e18d1ab..85ed61811af53 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -985,27 +985,27 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovapd 160(%rdi), %ymm3
; AVX-NEXT: vmovapd 128(%rdi), %ymm4
-; AVX-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX-NEXT: vmovaps (%rdi), %ymm7
; AVX-NEXT: vmovaps 96(%rdi), %ymm0
; AVX-NEXT: vmovaps 64(%rdi), %ymm1
-; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm5
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm5[0,0],ymm0[6,4],ymm5[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,2],ymm2[6,4],ymm5[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm8, %xmm9
-; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1],xmm9[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
+; AVX-NEXT: vmovaps 32(%rdi), %ymm6
+; AVX-NEXT: vmovaps (%rdi), %ymm7
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm8
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm8[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,3]
+; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm9
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm9[0,0],ymm0[6,4],ymm9[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm1[2,2],ymm10[6,4],ymm1[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[1],ymm10[3],ymm4[2]
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,0],ymm5[2,3],ymm11[6,4],ymm5[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm9[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[1,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm8[1,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,0],ymm9[1,0],ymm0[7,4],ymm9[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm1[2,3],ymm8[6,4],ymm1[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm4[1,3],ymm10[7,5],ymm4[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
@@ -1953,162 +1953,161 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i32_stride6_vf16:
; AVX: # %bb.0:
-; AVX-NEXT: subq $328, %rsp # imm = 0x148
-; AVX-NEXT: vmovaps 224(%rdi), %ymm12
-; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 288(%rdi), %ymm10
-; AVX-NEXT: vmovaps 256(%rdi), %ymm4
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 160(%rdi), %ymm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 128(%rdi), %ymm13
-; AVX-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps (%rdi), %ymm14
+; AVX-NEXT: subq $264, %rsp # imm = 0x108
+; AVX-NEXT: vmovaps 224(%rdi), %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 192(%rdi), %ymm10
+; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 160(%rdi), %ymm0
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 128(%rdi), %ymm7
; AVX-NEXT: vmovaps 96(%rdi), %ymm9
-; AVX-NEXT: vmovaps 64(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm2
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm9[2,0],ymm2[0,0],ymm9[6,4],ymm2[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm2[2,2],ymm5[6,4],ymm2[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5],ymm14[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0,1],xmm7[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[0,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm1[0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[1],ymm8[3],ymm13[2]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm11[6,7]
+; AVX-NEXT: vmovaps 64(%rdi), %ymm15
+; AVX-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps (%rdi), %ymm2
+; AVX-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm6[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,3]
+; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm15, %ymm8
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm15[2,2],ymm4[6,4],ymm15[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT: vmovapd %ymm7, %ymm1
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[3],ymm1[2]
+; AVX-NEXT: vmovapd %ymm1, %ymm13
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm1
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm10[2,0],ymm1[0,0],ymm10[6,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm1[2,2],ymm5[6,4],ymm1[6,6]
-; AVX-NEXT: vmovaps 192(%rdi), %ymm15
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7]
-; AVX-NEXT: vmovapd 352(%rdi), %ymm4
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm3[0,2],xmm0[0,3]
+; AVX-NEXT: vmovaps 288(%rdi), %ymm11
+; AVX-NEXT: vmovaps 256(%rdi), %ymm10
+; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm10, %ymm5
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm11[2,0],ymm5[0,0],ymm11[6,4],ymm5[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm10[2,2],ymm14[6,4],ymm10[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT: vmovapd 352(%rdi), %ymm3
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 320(%rdi), %ymm12
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[1],ymm4[3],ymm12[2]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm2[2,3],ymm11[6,4],ymm2[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0],xmm7[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[1,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm13[1,3],ymm8[7,5],ymm13[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm3[0],ymm12[1],ymm3[3],ymm12[2]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7]
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[1,3]
+; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm8[1,0],ymm9[7,4],ymm8[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm15[2,3],ymm4[6,4],ymm15[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm13[1,3],ymm7[7,5],ymm13[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,0],ymm1[1,0],ymm10[7,4],ymm1[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,0],xmm0[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm5[1,0],ymm11[7,4],ymm5[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,3],ymm1[6,4],ymm10[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1],ymm12[1,3],ymm4[7,5],ymm12[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1],ymm12[1,3],ymm3[7,5],ymm12[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm7[2,0],ymm9[6,5],ymm7[6,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vblendps $12, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm15[2,0],ymm9[6,5],ymm15[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm5[2,3]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm7[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0],ymm4[2,0],ymm8[4,4],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload
-; AVX-NEXT: # ymm11 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,1],ymm3[2,0],ymm10[6,5],ymm3[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm3[2,0],ymm4[4,4],ymm3[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,1],ymm10[2,0],ymm11[6,5],ymm10[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm11, %xmm15
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,0],xmm15[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm14[2,0],ymm0[4,4],ymm14[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm8, %xmm14
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm14[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[3,1],ymm8[4,5],ymm4[7,5]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm5[3,3]
-; AVX-NEXT: vmovaps %ymm7, %ymm1
-; AVX-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[3,1],ymm4[4,5],ymm3[7,5]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[3,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,1],ymm15[2,1],ymm1[7,5],ymm15[6,5]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm14[3,1],ymm0[4,5],ymm14[7,5]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm11[3,1],xmm15[3,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5]
-; AVX-NEXT: vmovaps %ymm3, %ymm15
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm6[3,1],ymm0[4,5],ymm6[7,5]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm8[3,1],xmm14[3,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm10[2,1],ymm11[7,5],ymm10[6,5]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5,6,7]
; AVX-NEXT: vmovaps 32(%rdi), %xmm3
; AVX-NEX...
[truncated]
; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm1
; AVX2-NEXT: vmovd %edi, %xmm2
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
Why do we do both vpinsrw + vmovd? Can't it be:
vpinsrw $0, %edi, %xmm0, %xmm1 OR vmovd %edi, %xmm1
vpbroadcastw %xmm1, %ymm1
vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
LowerINSERT_VECTOR_ELT is very limited in when it uses the broadcast+blend path - I'll see if there's a way to improve it.
I've raised #129056
LGTM.
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
The two vpblend instructions seem like a regression compared with vinserti128?
It's avoiding the and-mask load, which is great - rather annoying that we still create duplicate xor-zero vector sequences instead of reusing them...
This was reported back in #26392 - we've tried to resolve it repeatedly and never fixed it properly.
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
There are more similar patterns.
…ors (llvm#129042) If the shuffle combine doesn't require the subvector of an insert_subvector node, we can just combine the base vector directly.