[X86] combineConcatVectorOps - convert ISD::VECTOR_SHUFFLE concatenation to use combineConcatVectorOps recursion #130610
Merged
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: Only concatenate ISD::VECTOR_SHUFFLE nodes if at least one operand is beneficial to concatenate.

Patch is 35.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130610.diff

9 Files Affected:
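The intent, as a minimal standalone sketch (stand-in names and types, not the actual LLVM helpers; `concatShuffles`, `Vec`, and the boolean flags below model `CombineSubOperand`, `ConcatSubOperand`, the shared-operand check, and `Subtarget.hasVBMI()`): only fold two narrow shuffles into one wide shuffle when at least one operand pair already concatenates profitably, or when both shuffles reuse the same operands on a VBMI target. The real change in X86ISelLowering.cpp follows in the diff.

```cpp
#include <optional>
#include <string>

// Stand-in for an SDValue: just a printable name, for illustration only.
struct Vec { std::string Name; };

// Concat0/Concat1 are the results of trying to concatenate each operand pair
// *profitably* (empty if that attempt failed).
std::optional<Vec> concatShuffles(std::optional<Vec> Concat0,
                                  std::optional<Vec> Concat1,
                                  bool SameOperands, bool HasVBMI) {
  // Bail out unless the concatenation pays for itself on at least one side,
  // or the repeated-operand + VBMI case guarantees a cheap cross-lane shuffle.
  if (!Concat0 && !Concat1 && !(SameOperands && HasVBMI))
    return std::nullopt;
  // Force a plain concatenation only for the side that did not combine on its
  // own, mirroring "Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0)".
  Vec Lo = Concat0 ? *Concat0 : Vec{"concat(lhs0, lhs1)"};
  Vec Hi = Concat1 ? *Concat1 : Vec{"concat(rhs0, rhs1)"};
  return Vec{"wide_shuffle(" + Lo.Name + ", " + Hi.Name + ")"};
}
```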
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 22a8728451d9d..83057b7b2b906 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57971,24 +57971,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// TODO: Relax VBMI requirement for repeated shuffle ops - currently
// limited to targets that should always have good cross lane shuffles.
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
- (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
- (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1) ||
- (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
- Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
- Subtarget.hasVBMI()))) {
- int NumSubElts = Op0.getValueType().getVectorNumElements();
- SmallVector<int> NewMask;
- for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
- M = M >= NumSubElts ? M + NumSubElts : M;
- NewMask.push_back(M);
- }
- for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
- if (0 <= M)
- M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
- NewMask.push_back(M);
+ (EltSizeInBits >= 32 || Subtarget.hasInt256())) {
+ SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+ SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+ if (Concat0 || Concat1 ||
+ (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
+ Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
+ Subtarget.hasVBMI())) {
+ int NumSubElts = Op0.getValueType().getVectorNumElements();
+ SmallVector<int> NewMask;
+ for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
+ M = M >= NumSubElts ? M + NumSubElts : M;
+ NewMask.push_back(M);
+ }
+ for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
+ if (0 <= M)
+ M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
+ NewMask.push_back(M);
+ }
+ Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
+ Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
+ return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
}
- return DAG.getVectorShuffle(VT, DL, ConcatSubOperand(VT, Ops, 0),
- ConcatSubOperand(VT, Ops, 1), NewMask);
}
break;
}
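The mask arithmetic above is unchanged from the old code path. As a worked, standalone sketch (my own `concatShuffleMask` helper, not the LLVM code), each index from the first sub-shuffle keeps pointing at the low halves of the concatenated operands, while indices from the second sub-shuffle are shifted by `NumSubElts` into the high halves:

```cpp
#include <cassert>
#include <vector>

// Build the mask of the wide shuffle formed from two narrow shuffles:
//   wide = shuffle(concat(A0, A1), concat(B0, B1), NewMask)
// where the narrow shuffles were shuffle(A0, B0, Mask0) and
// shuffle(A1, B1, Mask1), each over NumSubElts elements (-1 == undef).
std::vector<int> concatShuffleMask(const std::vector<int> &Mask0,
                                   const std::vector<int> &Mask1,
                                   int NumSubElts) {
  std::vector<int> NewMask;
  // A0 is the low half of the first wide operand; B0 is the low half of the
  // second wide operand, which starts at mask index 2 * NumSubElts.
  for (int M : Mask0)
    NewMask.push_back(M >= NumSubElts ? M + NumSubElts : M);
  // A1/B1 are the high halves, so shift by an extra NumSubElts (undef stays).
  for (int M : Mask1)
    NewMask.push_back(M < 0 ? M
                            : (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts);
  return NewMask;
}

int main() {
  // Two v4 interleave masks (lo and hi) merge into a single v8 mask.
  std::vector<int> Lo = {0, 4, 1, 5}, Hi = {2, 6, 3, 7};
  std::vector<int> Wide = concatShuffleMask(Lo, Hi, 4);
  assert((Wide == std::vector<int>{0, 8, 1, 9, 6, 14, 7, 15}));
}
```

Running it, the two v4 masks fold into the single v8 mask `{0, 8, 1, 9, 6, 14, 7, 15}` checked by the assert, i.e. exactly the concatenation of the two narrow shuffle results.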
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 5fd4dfa7cc262..967f26f70946a 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -255,9 +255,9 @@ define <16 x i8> @splatvar_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
;
; GFNIAVX512BW-LABEL: splatvar_rotr_v16i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; GFNIAVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; GFNIAVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll
index 2eef32eb61414..c2e1c92fddcf6 100644
--- a/llvm/test/CodeGen/X86/known-bits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-bits-vector.ll
@@ -384,23 +384,19 @@ declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; X86-LABEL: knownbits_mask_concat_uitofp:
; X86: # %bb.0:
-; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
-; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071]
-; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
-; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; X86-NEXT: vandps %xmm2, %xmm0, %xmm0
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,5,7,5,7]
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: knownbits_mask_concat_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
-; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071]
-; X64-NEXT: vandps %xmm2, %xmm1, %xmm1
-; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; X64-NEXT: vandps %xmm2, %xmm0, %xmm0
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,5,7,5,7]
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 131071, i32 -1, i32 131071, i32 -1>
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 1ee03c5f1223f..d723ec849f328 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -974,35 +974,65 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin
; SSE-NEXT: movaps %xmm5, %xmm2
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: test_mul4x4_f32:
-; AVX1OR2: # %bb.0: # %entry
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
-; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
-; AVX1OR2-NEXT: vmulps %ymm4, %ymm5, %ymm4
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0,0,0,4,4,4,4]
-; AVX1OR2-NEXT: vmulps %ymm0, %ymm6, %ymm0
-; AVX1OR2-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6]
-; AVX1OR2-NEXT: vmulps %ymm7, %ymm4, %ymm7
-; AVX1OR2-NEXT: vaddps %ymm7, %ymm0, %ymm0
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1OR2-NEXT: vmulps %ymm2, %ymm1, %ymm2
-; AVX1OR2-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
-; AVX1OR2-NEXT: vmulps %ymm2, %ymm5, %ymm2
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
-; AVX1OR2-NEXT: vmulps %ymm5, %ymm6, %ymm5
-; AVX1OR2-NEXT: vaddps %ymm2, %ymm5, %ymm2
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6]
-; AVX1OR2-NEXT: vmulps %ymm5, %ymm4, %ymm4
-; AVX1OR2-NEXT: vaddps %ymm4, %ymm2, %ymm2
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
-; AVX1OR2-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; AVX1OR2-NEXT: vaddps %ymm1, %ymm2, %ymm1
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: test_mul4x4_f32:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX1-NEXT: vmulps %ymm4, %ymm5, %ymm4
+; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
+; AVX1-NEXT: vmulps %ymm6, %ymm7, %ymm0
+; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm6
+; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4
+; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX1-NEXT: vmulps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vmulps %ymm4, %ymm7, %ymm4
+; AVX1-NEXT: vaddps %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4
+; AVX1-NEXT: vaddps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mul4x4_f32:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX2-NEXT: vmulps %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,0,1]
+; AVX2-NEXT: vmulps %ymm6, %ymm7, %ymm0
+; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,1,0,1]
+; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT: vmulps %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX2-NEXT: vmulps %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmulps %ymm4, %ymm7, %ymm4
+; AVX2-NEXT: vaddps %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX2-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_mul4x4_f32:
; AVX512: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index f45405d885377..9ce682306f18b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1194,9 +1194,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index ba1621c67f480..7f2210742e7f1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -671,15 +671,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX2-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride3_vf16:
@@ -693,15 +694,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FP-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX2-FP-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride3_vf16:
@@ -715,15 +717,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride3_vf16:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index d6208aca3b2b7..fb8618be17f06 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1682,12 +1682,19 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) {
}
define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; AVX1OR2-LABEL: shuffle_v4i64_1032_v2i64:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64:
; AVX512VL-SLOW: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll
index cca9d4aa2a9f0..56001468898e4 100644
--- a/llvm/test/CodeGen/X86/widen_bitcnt.ll
+++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll
@@ -241,77 +241,59 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32
;
; AVX2-LABEL: widen_ctpop_v2i32_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm7
-; AVX2-NEXT: vpand %xmm5, %xmm7, %xmm7
-; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm7
-; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1
-; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm8
-; AVX2-NEXT: vpand %xmm5, %xmm8, %xmm8
-; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm8
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0]
-; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vpaddb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm2
-; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm2[0]
-; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpaddb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
+; AVX2-NEXT: vpsadbw %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
+; AVX2-NEXT: vpsadbw %ym...
[truncated]