[X86] combineConcatVectorOps - attempt to recursively call combineConcatVectorOps from inside ConcatSubOperand. #131303

Merged
merged 1 commit into llvm:main from x86-concatop-bitcast on Mar 14, 2025

Conversation

@RKSimon (Collaborator) commented Mar 14, 2025

Before falling back to creating a generic ISD::CONCAT_VECTORS node, see if we can directly concat the subvectors if we peek through any bitcasts.
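For readers skimming the diff below, here is a minimal, self-contained sketch of the control-flow shape this change introduces, written against toy stand-in types rather than the real SelectionDAG API (`Node`, `NodeRef`, `combineConcat`, the opcode strings, and this simplified `peekThroughBitcasts` are hypothetical, for illustration only): strip any bitcasts from the subvectors, retry the concat combine recursively at `Depth + 1`, and only fall back to the generic concat node if that recursive attempt finds nothing.

```cpp
// Hedged sketch only: toy types standing in for SDValue/SelectionDAG.
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

struct Node {
  std::string Op;                              // e.g. "bitcast", "load", "wide-load"
  std::vector<std::shared_ptr<Node>> Operands; // operand list
};
using NodeRef = std::shared_ptr<Node>;

// Modeled on the real peekThroughBitcasts helper: skip stacked bitcasts.
static NodeRef peekThroughBitcasts(NodeRef N) {
  while (N && N->Op == "bitcast" && !N->Operands.empty())
    N = N->Operands.front();
  return N;
}

// Try to fold a concat of Subs into something cheaper; nullptr means "no fold".
static NodeRef combineConcat(const std::vector<NodeRef> &Subs, unsigned Depth) {
  constexpr unsigned MaxDepth = 4; // arbitrary recursion guard for the sketch
  if (Depth > MaxDepth || Subs.empty())
    return nullptr;

  // Toy fold: concatenating the same load over and over becomes one wider load.
  bool AllSameLoad = true;
  for (const NodeRef &S : Subs)
    AllSameLoad &= (S && S->Op == "load" && S.get() == Subs.front().get());
  if (AllSameLoad)
    return std::make_shared<Node>(Node{"wide-load", {Subs.front()}});

  // The shape this patch adds: peek through bitcasts on every subvector and
  // retry the combine one level deeper before giving up.
  std::vector<NodeRef> Peeked;
  bool AnyBitcast = false;
  for (const NodeRef &S : Subs) {
    NodeRef P = peekThroughBitcasts(S);
    AnyBitcast |= (P != S);
    Peeked.push_back(P);
  }
  if (AnyBitcast)
    if (NodeRef Folded = combineConcat(Peeked, Depth + 1))
      // Cast the folded result back to the caller's type, as the real code
      // does with DAG.getBitcast(VT, ConcatSrc).
      return std::make_shared<Node>(Node{"bitcast", {Folded}});

  return nullptr; // caller then emits the generic concat node
}

int main() {
  // Four bitcast-wrapped views of the same load: without the recursive retry
  // this falls through to a generic concat; with it, the concat folds.
  NodeRef Load = std::make_shared<Node>(Node{"load", {}});
  NodeRef Cast = std::make_shared<Node>(Node{"bitcast", {Load}});
  NodeRef Folded = combineConcat({Cast, Cast, Cast, Cast}, /*Depth=*/0);
  std::printf("%s\n", Folded ? Folded->Op.c_str() : "generic concat");
  return 0;
}
```

In the actual patch the same shape applies to the SDValue operands: the subvectors are bitcast to SubVT and combineConcatVectorOps is re-entered with the bitcast-free ConcatVT before the ISD::CONCAT_VECTORS fallback. That appears to be what lets the AVX512 test diffs below fold, for example, a ymm load plus vpermi2d of a memory operand into a single zmm load fed into vpermt2d.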

…catVectorOps from inside ConcatSubOperand.

Before falling back to creating a generic ISD::CONCAT_VECTORS node, see if we can directly concat the subvectors if we peek through any bitcasts.
@llvmbot (Member) commented Mar 14, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Before falling back to creating a generic ISD::CONCAT_VECTORS node, see if we can directly concat the subvectors if we peek through any bitcasts.


Patch is 293.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131303.diff

6 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+5-2)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll (+284-284)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+1214-1224)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+6-6)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll (+106-114)
  • (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll (+6-6)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3360f199ccfc4..ac39c8fc0cba3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57937,11 +57937,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       // Attempt to peek through bitcasts and concat the original subvectors.
       EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
       if (SubVT.isSimple() && SubVT.isVector()) {
-        EVT ConcatVT =
-            EVT::getVectorVT(Ctx, SubVT.getScalarType(),
+        MVT ConcatVT =
+            MVT::getVectorVT(SubVT.getSimpleVT().getScalarType(),
                              SubVT.getVectorElementCount() * Subs.size());
         for (SDValue &Sub : Subs)
           Sub = DAG.getBitcast(SubVT, Sub);
+        if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
+                                                       Subtarget, Depth + 1))
+          return DAG.getBitcast(VT, ConcatSrc);
         return DAG.getBitcast(
             VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
       }
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 77e7397e2fc96..44bd9c03c7ce7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -1653,305 +1653,305 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride3_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm3
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512-NEXT:    vprold $16, %xmm2, %xmm4
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-NEXT:    vprold $16, %xmm3, %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm5
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [21,21,0,22,22,0,23,23,0,0,0,0,1,1,0,2]
-; AVX512-NEXT:    vpermi2d (%rdx), %zmm5, %zmm6
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4))
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm4
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm9
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
-; AVX512-NEXT:    vpor %ymm4, %ymm9, %ymm4
-; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm9
-; AVX512-NEXT:    vmovdqa 48(%rsi), %xmm11
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm12, %xmm7
-; AVX512-NEXT:    vprold $16, %xmm11, %xmm11
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm9, %ymm7
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm7[0,1,2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm9
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [5,5,0,6,6,0,7,7]
-; AVX512-NEXT:    vpermd %ymm5, %ymm11, %ymm5
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm9, %zmm5
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm4 & mem)
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm4
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm8
-; AVX512-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
-; AVX512-NEXT:    vpor %ymm4, %ymm8, %ymm4
-; AVX512-NEXT:    vprold $16, %xmm0, %xmm8
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm0
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[0,1,2,3]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm1
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512-NEXT:    vpermd %ymm3, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
-; AVX512-NEXT:    vmovdqa64 %zmm1, (%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 128(%rcx)
-; AVX512-NEXT:    vmovdqa64 %zmm6, 64(%rcx)
+; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm0
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm4
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm3
+; AVX512-NEXT:    vprold $16, %xmm3, %xmm5
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm4, %ymm4
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [5,5,0,6,6,0,7,7,0,16,16,0,17,17,0,18]
+; AVX512-NEXT:    vpermt2d %zmm6, %zmm7, %zmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5))
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
+; AVX512-NEXT:    vpor %ymm5, %ymm10, %ymm5
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm10
+; AVX512-NEXT:    vmovdqa 48(%rsi), %xmm12
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm13, %xmm8
+; AVX512-NEXT:    vprold $16, %xmm12, %xmm12
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm10, %ymm8
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm6, %ymm10
+; AVX512-NEXT:    vpermd %ymm6, %ymm7, %ymm6
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm6
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm5 & mem)
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
+; AVX512-NEXT:    vpor %ymm5, %ymm7, %ymm5
+; AVX512-NEXT:    vprold $16, %xmm1, %xmm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm1, %ymm1
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2]
+; AVX512-NEXT:    vpermd %ymm4, %ymm3, %ymm3
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpandn %ymm3, %ymm4, %ymm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & zmm4)
+; AVX512-NEXT:    vmovdqa64 %zmm2, (%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm6, 128(%rcx)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rcx)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride3_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 16(%rsi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512-FCP-NEXT:    vprold $16, %xmm2, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vprold $16, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm5
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [21,21,0,22,22,0,23,23,0,0,0,0,1,1,0,2]
-; AVX512-FCP-NEXT:    vpermi2d (%rdx), %zmm5, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4))
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vpor %ymm4, %ymm9, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa 48(%rsi), %xmm11
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm7
-; AVX512-FCP-NEXT:    vprold $16, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm9, %ymm7
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm7[0,1,2,3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm9
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [5,5,0,6,6,0,7,7]
-; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm11, %ymm5
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm9, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm4 & mem)
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
-; AVX512-FCP-NEXT:    vpor %ymm4, %ymm8, %ymm4
-; AVX512-FCP-NEXT:    vprold $16, %xmm0, %xmm8
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[0,1,2,3]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rcx)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm0
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa 16(%rsi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
+; AVX512-FCP-NEXT:    vprold $16, %xmm3, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [5,5,0,6,6,0,7,7,0,16,16,0,17,17,0,18]
+; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm7, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5))
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm10, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm10
+; AVX512-FCP-NEXT:    vmovdqa 48(%rsi), %xmm12
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm8
+; AVX512-FCP-NEXT:    vprold $16, %xmm12, %xmm12
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm10, %ymm8
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm10
+; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm7, %ymm6
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm5 & mem)
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT:    vprold $16, %xmm1, %xmm7
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2]
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpandn %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zm...
[truncated]

@RKSimon RKSimon merged commit 73e93ec into llvm:main Mar 14, 2025
11 of 12 checks passed
@RKSimon RKSimon deleted the x86-concatop-bitcast branch March 14, 2025 12:10
frederik-h pushed a commit to frederik-h/llvm-project that referenced this pull request Mar 18, 2025
…catVectorOps from inside ConcatSubOperand. (llvm#131303)

Before falling back to creating a generic ISD::CONCAT_VECTORS node, see if we can directly concat the subvectors if we peek through any bitcasts.