
[X86] combineConcatVectorOps - convert ISD::VECTOR_SHUFFLE concatenation to use combineConcatVectorOps recursion #130610


Merged: 1 commit merged into llvm:main from x86-concat-shuffles on Mar 11, 2025

Conversation

RKSimon (Collaborator) commented Mar 10, 2025

Only concatenate ISD::VECTOR_SHUFFLE nodes if at least one operand is beneficial to concatenate

llvmbot (Member) commented Mar 10, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Only concatenate ISD::VECTOR_SHUFFLE nodes if at least one operand is beneficial to concatenate
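In effect, the fold rewrites concat(shuffle(A0,B0,M0), shuffle(A1,B1,M1)) as shuffle(concat(A0,A1), concat(B0,B1), NewMask), and after this change it is only attempted when the combineConcatVectorOps recursion (CombineSubOperand in the hunk below) can already produce at least one of the concatenated operands, or when both shuffles reuse the same operands and VBMI is available. Below is a minimal standalone sketch of the NewMask construction only, using plain STL containers instead of LLVM types; the helper name concatShuffleMasks is illustrative and not an LLVM API.

// Standalone illustration of the shuffle-mask remapping used when
// concat(shuffle(A0,B0,M0), shuffle(A1,B1,M1)) is rewritten as
// shuffle(concat(A0,A1), concat(B0,B1), NewMask). Sketch of the mask math
// only, not the LLVM implementation.
#include <cassert>
#include <cstdio>
#include <vector>

// -1 denotes an undef mask element, as in ISD::VECTOR_SHUFFLE.
std::vector<int> concatShuffleMasks(const std::vector<int> &M0,
                                    const std::vector<int> &M1,
                                    int NumSubElts) {
  std::vector<int> NewMask;
  // Lanes from the first shuffle: A0 elements keep their index; B0 elements
  // shift up by NumSubElts because concat(A0,A1) now fills the first
  // 2*NumSubElts indices of the shuffle's input space.
  for (int M : M0)
    NewMask.push_back(M >= NumSubElts ? M + NumSubElts : M);
  // Lanes from the second shuffle: A1 elements land after A0 in the first
  // concatenated operand; B1 elements land after B0 in the second one.
  for (int M : M1)
    NewMask.push_back(M < 0 ? M
                            : (M >= NumSubElts ? M + NumSubElts : M) +
                                  NumSubElts);
  return NewMask;
}

int main() {
  // Two v4 sub-shuffles interleaving the low/high halves of their operands.
  std::vector<int> M0 = {0, 4, 1, 5};
  std::vector<int> M1 = {2, 6, 3, 7};
  std::vector<int> NewMask = concatShuffleMasks(M0, M1, /*NumSubElts=*/4);
  const std::vector<int> Expected = {0, 8, 1, 9, 6, 14, 7, 15};
  assert(NewMask == Expected);
  for (int M : NewMask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}

For 4-element sub-shuffles with masks {0,4,1,5} and {2,6,3,7}, this produces the combined mask {0,8,1,9,6,14,7,15}, matching the NewMask loops in the X86ISelLowering.cpp hunk below.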


Patch is 35.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130610.diff

9 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+21-17)
  • (modified) llvm/test/CodeGen/X86/gfni-rotates.ll (+3-3)
  • (modified) llvm/test/CodeGen/X86/known-bits-vector.ll (+6-10)
  • (modified) llvm/test/CodeGen/X86/matrix-multiply.ll (+59-29)
  • (modified) llvm/test/CodeGen/X86/vector-fshr-rot-128.ll (+3-3)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll (+18-15)
  • (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll (+13-6)
  • (modified) llvm/test/CodeGen/X86/widen_bitcnt.ll (+106-150)
  • (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+38-18)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 22a8728451d9d..83057b7b2b906 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57971,24 +57971,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       // TODO: Relax VBMI requirement for repeated shuffle ops - currently
       // limited to targets that should always have good cross lane shuffles.
       if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
-          (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
-          (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1) ||
-           (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
-            Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
-            Subtarget.hasVBMI()))) {
-        int NumSubElts = Op0.getValueType().getVectorNumElements();
-        SmallVector<int> NewMask;
-        for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
-          M = M >= NumSubElts ? M + NumSubElts : M;
-          NewMask.push_back(M);
-        }
-        for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
-          if (0 <= M)
-            M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
-          NewMask.push_back(M);
+          (EltSizeInBits >= 32 || Subtarget.hasInt256())) {
+        SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+        SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+        if (Concat0 || Concat1 ||
+            (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
+             Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
+             Subtarget.hasVBMI())) {
+          int NumSubElts = Op0.getValueType().getVectorNumElements();
+          SmallVector<int> NewMask;
+          for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
+            M = M >= NumSubElts ? M + NumSubElts : M;
+            NewMask.push_back(M);
+          }
+          for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
+            if (0 <= M)
+              M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
+            NewMask.push_back(M);
+          }
+          Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
+          Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
+          return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
         }
-        return DAG.getVectorShuffle(VT, DL, ConcatSubOperand(VT, Ops, 0),
-                                    ConcatSubOperand(VT, Ops, 1), NewMask);
       }
       break;
     }
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 5fd4dfa7cc262..967f26f70946a 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -255,9 +255,9 @@ define <16 x i8> @splatvar_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: splatvar_rotr_v16i8:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; GFNIAVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; GFNIAVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; GFNIAVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; GFNIAVX512BW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; GFNIAVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll
index 2eef32eb61414..c2e1c92fddcf6 100644
--- a/llvm/test/CodeGen/X86/known-bits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-bits-vector.ll
@@ -384,23 +384,19 @@ declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
 define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; X86-LABEL: knownbits_mask_concat_uitofp:
 ; X86:       # %bb.0:
-; X86-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071]
-; X86-NEXT:    vandps %xmm2, %xmm1, %xmm1
-; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; X86-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,5,7,5,7]
 ; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_concat_uitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
-; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071]
-; X64-NEXT:    vandps %xmm2, %xmm1, %xmm1
-; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; X64-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,5,7,5,7]
 ; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; X64-NEXT:    retq
   %1 = and <4 x i32> %a0, <i32 131071, i32 -1, i32 131071, i32 -1>
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 1ee03c5f1223f..d723ec849f328 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -974,35 +974,65 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin
 ; SSE-NEXT:    movaps %xmm5, %xmm2
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: test_mul4x4_f32:
-; AVX1OR2:       # %bb.0: # %entry
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
-; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
-; AVX1OR2-NEXT:    vmulps %ymm4, %ymm5, %ymm4
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm6
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0,0,0,4,4,4,4]
-; AVX1OR2-NEXT:    vmulps %ymm0, %ymm6, %ymm0
-; AVX1OR2-NEXT:    vaddps %ymm4, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm4
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6]
-; AVX1OR2-NEXT:    vmulps %ymm7, %ymm4, %ymm7
-; AVX1OR2-NEXT:    vaddps %ymm7, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1OR2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; AVX1OR2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
-; AVX1OR2-NEXT:    vmulps %ymm2, %ymm5, %ymm2
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
-; AVX1OR2-NEXT:    vmulps %ymm5, %ymm6, %ymm5
-; AVX1OR2-NEXT:    vaddps %ymm2, %ymm5, %ymm2
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6]
-; AVX1OR2-NEXT:    vmulps %ymm5, %ymm4, %ymm4
-; AVX1OR2-NEXT:    vaddps %ymm4, %ymm2, %ymm2
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
-; AVX1OR2-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; AVX1OR2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: test_mul4x4_f32:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vmulps %ymm4, %ymm5, %ymm4
+; AVX1-NEXT:    vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm7
+; AVX1-NEXT:    vmulps %ymm6, %ymm7, %ymm0
+; AVX1-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm6
+; AVX1-NEXT:    vmulps %ymm4, %ymm6, %ymm4
+; AVX1-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX1-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX1-NEXT:    vmulps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vmulps %ymm4, %ymm7, %ymm4
+; AVX1-NEXT:    vaddps %ymm2, %ymm4, %ymm2
+; AVX1-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX1-NEXT:    vmulps %ymm4, %ymm6, %ymm4
+; AVX1-NEXT:    vaddps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX1-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_mul4x4_f32:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vmulps %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vmulps %ymm6, %ymm7, %ymm0
+; AVX2-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm1[0,1,0,1]
+; AVX2-NEXT:    vmulps %ymm4, %ymm6, %ymm4
+; AVX2-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vmulps %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vmulps %ymm4, %ymm7, %ymm4
+; AVX2-NEXT:    vaddps %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX2-NEXT:    vmulps %ymm4, %ymm6, %ymm4
+; AVX2-NEXT:    vaddps %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_mul4x4_f32:
 ; AVX512:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index f45405d885377..9ce682306f18b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1194,9 +1194,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512VLBW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index ba1621c67f480..7f2210742e7f1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -671,15 +671,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX2-NEXT:    vmovdqa %xmm1, (%rcx)
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %xmm2, 32(%rcx)
+; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: store_i8_stride3_vf16:
@@ -693,15 +694,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rcx)
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, 32(%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: store_i8_stride3_vf16:
@@ -715,15 +717,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rcx)
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
 ; AVX512-LABEL: store_i8_stride3_vf16:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index d6208aca3b2b7..fb8618be17f06 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1682,12 +1682,19 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) {
 }
 
 define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; AVX1OR2-LABEL: shuffle_v4i64_1032_v2i64:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT:    retq
 ;
 ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64:
 ; AVX512VL-SLOW:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll
index cca9d4aa2a9f0..56001468898e4 100644
--- a/llvm/test/CodeGen/X86/widen_bitcnt.ll
+++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll
@@ -241,77 +241,59 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32
 ;
 ; AVX2-LABEL: widen_ctpop_v2i32_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
-; AVX2-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm7
-; AVX2-NEXT:    vpand %xmm5, %xmm7, %xmm7
-; AVX2-NEXT:    vpshufb %xmm7, %xmm6, %xmm7
-; AVX2-NEXT:    vpand %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm8
-; AVX2-NEXT:    vpand %xmm5, %xmm8, %xmm8
-; AVX2-NEXT:    vpshufb %xmm8, %xmm6, %xmm8
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0]
-; AVX2-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT:    vpaddb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vpsrlw $4, %xmm3, %xmm2
-; AVX2-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm2[0]
-; AVX2-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpsadbw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpsadbw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
+; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpaddb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
+; AVX2-NEXT:    vpsadbw %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
+; AVX2-NEXT:    vpsadbw %ym...
[truncated]

RKSimon force-pushed the x86-concat-shuffles branch from 22b00be to 48196e9 on March 11, 2025 11:54
[X86] combineConcatVectorOps - convert ISD::VECTOR_SHUFFLE concatenation to use combineConcatVectorOps recursion

Only concatenate ISD::VECTOR_SHUFFLE nodes if at least one operand is beneficial to concatenate
RKSimon force-pushed the x86-concat-shuffles branch from 48196e9 to 1548a2d on March 11, 2025 12:08
RKSimon merged commit 7c77a46 into llvm:main on Mar 11, 2025
9 of 10 checks passed
RKSimon deleted the x86-concat-shuffles branch on March 11, 2025 14:37