[X86] Fold VPERMV(MASK,CONCAT(LO,HI)) -> VPERMV3(WIDEN(LO),MASK',WIDEN(HI)) #129708
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: If the VPERMV node is shuffling a source that is concatenated from separate subvectors, attempt to shuffle from the separate subvectors directly using an equivalent VPERMV3 node (see the mask-conversion sketch after the truncated diff below).

Patch is 1.31 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/129708.diff

13 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8f0ba3e24ed02..28c1321b7560f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42597,6 +42597,46 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
return SDValue();
}
+ case X86ISD::VPERMV: {
+ // Combine VPERMV to VPERMV3 if the source operand can be freely split.
+ SmallVector<int, 32> Mask;
+ SmallVector<SDValue, 2> SrcOps, SubOps;
+ SDValue Src = peekThroughBitcasts(N.getOperand(1));
+ if ((Subtarget.hasVLX() ||
+ (VT.is512BitVector() && Subtarget.hasAVX512())) &&
+ getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
+ collectConcatOps(Src.getNode(), SubOps, DAG)) {
+ assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
+ assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
+ assert((SubOps.size() == 2 || SubOps.size() == 4) &&
+ "Unexpected split ops");
+ // Bail if we were permuting a widened vector.
+ if (SubOps[SubOps.size() - 1].isUndef())
+ return SDValue();
+ // Bail if any subops would have folded into the concat.
+ if (any_of(SubOps, [](SDValue Op) { return isShuffleFoldableLoad(Op); }))
+ return SDValue();
+ // Concat 4x128 back to 2x256.
+ if (SubOps.size() == 4) {
+ SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
+ SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
+ }
+ // Convert mask to 2 operand shuffle.
+ int HalfElts = NumElts / 2;
+ for (int &M : Mask)
+ M += M >= HalfElts ? HalfElts : 0;
+ MVT MaskVT = N.getOperand(0).getSimpleValueType();
+ SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
+ /*IsMask=*/true);
+ SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
+ VT.getSizeInBits());
+ SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
+ VT.getSizeInBits());
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, DAG.getBitcast(VT, Lo),
+ NewMask, DAG.getBitcast(VT, Hi));
+ }
+ return SDValue();
+ }
case X86ISD::VPERMV3: {
// Combine VPERMV3 to widened VPERMV if the two source operands can be
// freely concatenated.
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 951a2b4cafa26..70c71ea6c8983 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -4242,16 +4242,15 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
-; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,0,23]
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -4259,16 +4258,15 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,0,23]
+; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -4371,16 +4369,15 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
-; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,0]
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -4388,16 +4385,15 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,0]
+; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -4508,16 +4504,15 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
-; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -4525,16 +4520,15 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
+; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -4636,12 +4630,11 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
-; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -4650,12 +4643,11 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
+; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -4769,16 +4761,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
+; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -4786,16 +4777,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
-; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
+; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -4898,12 +4888,11 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
+; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -4912,12 +4901,11 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
-; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
+; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 4a2e7d55d3e88..114b9f2c858bc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -1674,58 +1674,59 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-NEXT: vpermd (%rdx), %zmm4, %zmm5
-; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa 32(%rsi), %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm8
-; AVX512-NEXT: vpor %ymm3, %ymm8, %ymm3
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm8
-; AVX512-NEXT: vmovdqa 48(%rsi), %xmm10
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512-NEXT: vpshufb %xmm7, %xmm11, %xmm7
-; AVX512-NEXT: vprold $16, %xmm10, %xmm10
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512-NEXT: vmovdqa 32(%rdx), %ymm5
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [21,21,0,22,22,0,23,23,0,0,0,0,1,1,0,2]
+; AVX512-NEXT: vpermi2d (%rdx), %zmm5, %zmm6
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4))
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX512-NEXT: vmovdqa 32(%rsi), %ymm9
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-NEXT: vpor %ymm4, %ymm9, %ymm4
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm9
+; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm7
+; AVX512-NEXT: vprold $16, %xmm11, %xmm11
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7
; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[4,5,6,7]
-; AVX512-NEXT: vmovdqa (%rdx), %ymm7
-; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-NEXT: vpermd %ymm8, %ymm4, %ymm4
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa (%rsi), %ymm6
-; AVX512-NEXT: vpshufb %ymm9, %ymm6, %ymm6
-; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
-; AVX512-NEXT: vprold $16, %xmm0, %xmm6
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm7[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm9
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [5,5,0,6,6,0,7,7]
+; AVX512-NEXT: vpermd %ymm5, %ymm11, %ymm5
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %z...
[truncated]
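As an illustration of the core step in the X86ISelLowering.cpp hunk above, here is a minimal standalone sketch (plain C++, not LLVM code) of how the single-operand VPERMV mask is rewritten into a two-operand VPERMV3 mask once the concatenated source is split back into Lo and Hi. The function name toTwoOperandMask and the use of std::vector are illustrative assumptions, not part of the patch; the index arithmetic mirrors the "M += M >= HalfElts ? HalfElts : 0" loop in the diff.

#include <cstdio>
#include <vector>

// Sketch of the mask rewrite: a VPERMV mask indexes into concat(Lo, Hi),
// where Lo and Hi each hold HalfElts = NumElts / 2 elements. After Lo and Hi
// are widened into the two full-width VPERMV3 operands, indices in
// [0, NumElts) select from the first operand and [NumElts, 2 * NumElts) from
// the second, so every index that pointed into Hi is shifted up by HalfElts.
static std::vector<int> toTwoOperandMask(std::vector<int> Mask) {
  int NumElts = static_cast<int>(Mask.size());
  int HalfElts = NumElts / 2;
  for (int &M : Mask)
    if (M >= HalfElts)
      M += HalfElts;
  return Mask;
}

int main() {
  // v16i32 case from the tests: the vpermd mask begins [0,13,0,15] and the
  // converted vpermi2d mask begins [0,21,0,23]; only the low four lanes are
  // used in that test, so the remaining lanes are don't-care zeros here.
  std::vector<int> Mask(16, 0);
  Mask[1] = 13;
  Mask[3] = 15;
  std::vector<int> NewMask = toTwoOperandMask(Mask);
  for (int I = 0; I != 4; ++I)
    std::printf("%d ", NewMask[I]); // prints: 0 21 0 23
  std::printf("\n");
  return 0;
}

The widened Lo and Hi keep their payload in the low HalfElts lanes, which is why indices that already pointed into Lo need no further adjustment.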
Force-pushed 1f2d0e8 to c464a85 (compare).
…N(HI)): If the VPERMV node is shuffling a source that is concatenated from separate subvectors, attempt to shuffle from the separate subvectors directly using an equivalent VPERMV3 node.
Force-pushed c464a85 to ae1bb7c (compare).
LGTM.
…N(HI)) (llvm#129708): If the VPERMV node is shuffling a source that is concatenated from separate subvectors, attempt to shuffle from the separate subvectors directly using an equivalent VPERMV3 node.