
[X86] combineX86ShufflesRecursively - iteratively peek through bitcasts to free subvector widening/narrowing sources. #134701


Merged
2 commits merged into llvm:main from the x86-shuffle-through-subvectors branch on Apr 8, 2025

Conversation


@RKSimon (Collaborator) commented on Apr 7, 2025

Generalizes the existing code to repeatedly peek through mixed bitcast/insert_subvector/extract_subvector chains to find the source of the shuffle operand.
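For reference, the core of the change is a single peeling loop in combineX86ShufflesRecursively: instead of handling a lone bitcast, an undef-base INSERT_SUBVECTOR widening, and an EXTRACT_SUBVECTOR narrowing as separate one-shot cases, the loop keeps stripping whichever of the three appears next until none matches. A condensed sketch of that loop shape follows (taken from the diff in this PR; it omits the mask fix-up that marks now out-of-bounds indices as undef, and assumes Op, Mask and RootSizeInBits are in scope as in the surrounding function):

  SDValue BC = Op;
  while (true) {
    // Look through a single-use bitcast.
    if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
      BC = BC.getOperand(0);
      continue;
    }
    // Widening insert of a subvector into an undef base at index 0:
    // continue from the inserted subvector (the full patch also remaps
    // the shuffle mask indices that fall outside the narrower source).
    if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
        BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
      Op = BC = BC.getOperand(1);
      continue;
    }
    // Extract at index 0 whose source width divides the root width:
    // continue from the wider source.
    if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
        isNullConstant(BC.getOperand(1))) {
      Op = BC = BC.getOperand(0);
      continue;
    }
    break;
  }

Because each continue re-tests the newly exposed node, arbitrarily long mixed chains (e.g. a bitcast of an extract of an insert) now collapse to their ultimate source, which is what enables the narrower xmm-level vpblendw/vpshufb sequences visible in the updated tests below.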

@llvmbot (Member) commented on Apr 7, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Generalizes the existing code to repeatedly peek through mixed bitcast/insert_subvector/extract_subvector chains to find the source of the shuffle operand.


Patch is 937.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134701.diff

11 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+29-22)
  • (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll (+5-8)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll (+8-8)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll (+190-190)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+2796-2782)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll (+44-44)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+600-594)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll (+448-468)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+970-970)
  • (modified) llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll (+111-363)
  • (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+11-11)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bac5684733e60..d86eec1584274 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41115,30 +41115,37 @@ static SDValue combineX86ShufflesRecursively(
     }
   }
 
+  // Peek through any free bitcasts to insert_subvector vector widenings or
+  // extract_subvector nodes back to root size.
+  // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
   for (auto [I, Op] : enumerate(Ops)) {
-    // Peek through vector widenings + set out of bounds mask indices to undef.
-    // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
-    if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
-        isNullConstant(Op.getOperand(2))) {
-      Op = Op.getOperand(1);
-      unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
-      int Lo = I * Mask.size();
-      int Hi = (I + 1) * Mask.size();
-      int NewHi = Lo + (Mask.size() / Scale);
-      for (int &M : Mask) {
-        if (Lo <= M && NewHi <= M && M < Hi)
-          M = SM_SentinelUndef;
-      }
-    }
-
-    // Peek through any free bitcasts/extract_subvector nodes back to root size.
     SDValue BC = Op;
-    if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse())
-      BC = peekThroughOneUseBitcasts(BC);
-    while (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-           (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
-           isNullConstant(BC.getOperand(1))) {
-      Op = BC = BC.getOperand(0);
+    while (1) {
+      if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
+        BC = BC.getOperand(0);
+        continue;
+      }
+      if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
+          BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
+        // Set out of bounds mask indices to undef.
+        Op = BC = BC.getOperand(1);
+        unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
+        int Lo = I * Mask.size();
+        int Hi = (I + 1) * Mask.size();
+        int NewHi = Lo + (Mask.size() / Scale);
+        for (int &M : Mask) {
+          if (Lo <= M && NewHi <= M && M < Hi)
+            M = SM_SentinelUndef;
+        }
+        continue;
+      }
+      if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+          (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
+          isNullConstant(BC.getOperand(1))) {
+        Op = BC = BC.getOperand(0);
+        continue;
+      }
+      break;
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index b075d48627b18..1fada58f05ba9 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -4708,18 +4708,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm3
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index a39bc6b668669..da902b3aed5ab 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -1836,7 +1836,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -1858,7 +1858,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -1914,7 +1914,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -1936,7 +1936,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -1992,7 +1992,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -2014,7 +2014,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
@@ -2070,7 +2070,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -2092,7 +2092,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index 25bad7578c111..ca8fcf2ee0f2c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -4230,91 +4230,91 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
 ; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,0,2,1,8,9,8,9]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [8,8,0,9,0,1,0,1]
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm20, %zmm5
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm1
 ; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-FCP-NEXT:    kmovw %eax, %k2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm5 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm20 = [16,9,10,17,12,13,18,15]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm10
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm6, %zmm5
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm10[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm20, %zmm1
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm19, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX512-FCP-NEXT:    vpermt2d %ymm6, %ymm19, %ymm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
 ; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm2 {%k2}
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm4, %ymm19
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm4
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,10,9]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm19, %zmm19
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm2, %ymm19
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm20, %zmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,0,9]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm8, %xmm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm8[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm20, %zmm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm19, %zmm19
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm20, %zmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm20, %zmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm5
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm7, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm7 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
 ; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm8
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm3, %ymm8
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm14
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm3, %ymm14
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [8,9,20,11,12,21,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm15, %zmm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm6
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm15, %zmm7
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm14, %zmm7
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [0,1,0,1,10,10,10,10]
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm20, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm14
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[8],ymm5[8],ymm14[9],ymm5[9],ymm14[10],ymm5[10],ymm14[11],ymm5[11]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5]...
[truncated]

@RKSimon RKSimon merged commit 83fbe67 into llvm:main Apr 8, 2025
11 checks passed
@RKSimon RKSimon deleted the x86-shuffle-through-subvectors branch April 8, 2025 10:28