-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86] combineX86ShuffleChain - always combine to a new VPERMV node if the root shuffle was a VPERMV node #128183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
… the root shuffle was a VPERMV node. Similar to what we already do for VPERMV3 nodes - if we're trying to create a new unary variable shuffle and we started with a VPERMV node, then always create a new one if it reduces the shuffle chain depth.
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon). Changes: Similar to what we already do for VPERMV3 nodes - if we're trying to create a new unary variable shuffle and we started with a VPERMV node, then always create a new one if it reduces the shuffle chain depth. Patch is 61.02 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/128183.diff — 6 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dd666f15bf57f..f8f7d3e367896 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40090,8 +40090,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
bool AllowBWIVPERMV3 =
(Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
- // If root was a VPERMV3 node, always allow a variable shuffle.
- if (Root.getOpcode() == X86ISD::VPERMV3)
+ // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
+ if ((UnaryShuffle && Root.getOpcode() == X86ISD::VPERMV) ||
+ Root.getOpcode() == X86ISD::VPERMV3)
AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
bool MaskContainsZeros = isAnyZero(Mask);
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 351d98540c2a5..3f8f061f359f9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -588,20 +588,19 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
-; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,5,3,7,1,5,3,7]
+; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [5,7,1,3,7,0,0,0]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm5[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3],zero,zero,ymm4[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,6,0,4,2,6]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u,u,u,u,u,u,u,18,19,22,23,26,27,u,u,u,u]
@@ -670,17 +669,16 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
-; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,3,7,1,5,3,7]
; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29]
@@ -753,17 +751,16 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm5, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,3,7,1,5,3,7]
; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 6476c3139daa7..34100adacbeb9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -95,17 +95,17 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
-; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2]
-; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
-; AVX2-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
-; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2,2,1,4,6,6,5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,3,0,1,5,3,0,1]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
+; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3]
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovlps %xmm0, 48(%rax)
@@ -130,17 +130,17 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5]
-; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
-; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
-; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
-; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
-; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
+; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6]
+; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2,2,1,4,6,6,5]
+; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7]
+; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,3,0,1,5,3,0,1]
; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpermps %ymm0, %ymm3, %ymm0
+; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
+; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3]
; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FP-NEXT: vmovlps %xmm0, 48(%rax)
@@ -165,23 +165,22 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm1 = [3,5,7,u]
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
-; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,u,1]
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm2
-; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,4,0,0,2,4,0]
+; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [5,3,0,1,5,3,0,1]
+; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,4,2,0,0,4,2,0]
; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermps %ymm0, %ymm4, %ymm0
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,u,1]
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX2-FCP-NEXT: vmovlps %xmm2, 48(%rax)
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FCP-NEXT: vmovlps %xmm1, 48(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
-; AVX2-FCP-NEXT: vmovaps %xmm1, 32(%rax)
+; AVX2-FCP-NEXT: vmovaps %xmm3, 32(%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index f7a44fea5b02b..01a2f7f46f939 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -5576,8 +5576,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm4, %xmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm4, %xmm5
; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
@@ -5585,15 +5585,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm18
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17
; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10
; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm9, %ymm22
+; AVX512BW-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm22
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084
@@ -5611,9 +5611,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
; AVX512BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm16, %zmm23, %zmm23
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512BW-FCP-NEXT: vpshufb %zmm18, %zmm23, %zmm23
; AVX512BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
@@ -5633,66 +5633,65 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: kmovq %rax, %k4
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4}
; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm12, %xmm17
-; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm17, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm12, %xmm16
+; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm18, %xmm17
-; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm17, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm17, %xmm16
+; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm16, %xmm11
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm13, %xmm13
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
; AVX512BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
; AVX512BW-FCP-NEXT: kmovq %rax, %k4
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4}
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
-; AVX512BW-FCP-NEXT: vpermd %zmm13, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AV...
[truncated]
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Similar to what we already do for VPERMV3 nodes - if we're trying to create a new unary variable shuffle and we started with a VPERMV node then always create a new one if it reduces the shuffle chain depth