[X86] Fold VPERMV3(WIDEN(X),M,WIDEN(Y)) -> VPERMV(CONCAT(X,Y),M') iff the CONCAT is free #122750
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Minor followup to #122485 - if the source operands were widened half-size subvectors, then attempt to concatenate the subvectors directly, and then adjust the shuffle mask so references to the second operand now refer to the upper half of the concat result. (A small illustrative sketch of this mask remapping is included after the diff below.)

Full diff: https://github.com/llvm/llvm-project/pull/122750.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index add51fac4b9e62..8a0937c952180c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41711,9 +41711,10 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
+
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
-
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -42436,6 +42437,24 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
/*IsMask=*/true);
return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, N.getOperand(0));
}
+ // If sources are half width, then concat and use VPERMV with adjusted
+ // mask.
+ SDValue Ops[2];
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ if (sd_match(V1,
+ m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
+ sd_match(V2,
+ m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
+ Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
+ if (SDValue ConcatSrc =
+ combineConcatVectorOps(DL, VT, Ops, DAG, DCI, Subtarget)) {
+ for (int &M : Mask)
+ M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
+ SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
+ /*IsMask=*/true);
+ return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, ConcatSrc);
+ }
+ }
// Commute foldable source to the RHS.
if (isShuffleFoldableLoad(N.getOperand(0)) &&
!isShuffleFoldableLoad(N.getOperand(2))) {
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 1d82d57e5552fe..994c9f2446c80b 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -442,11 +442,9 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,32,40,0,0,0,0]
-; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
-; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
+; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT: retq
;
@@ -460,11 +458,9 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,32,40,0,0,0,0]
-; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
-; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2
-; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
+; AVX512VBMI-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index ded7c002c8735b..2721540305491d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -339,17 +339,14 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -378,17 +375,14 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -417,17 +411,14 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -456,17 +447,14 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64
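For intuition, here is a minimal standalone sketch of the mask adjustment described above; this is illustrative C++ only, not the LLVM implementation (the helper name remapMask and the driver are made up). Indices that referred to the second widened operand are shifted down by NumElts/2 so they point into the upper half of the concatenated source.

```cpp
#include <cstdio>
#include <vector>

// Illustrative only (not the LLVM implementation): remap a VPERMV3 mask that
// indexed two widened half-width sources so it indexes a single
// CONCAT(LHS, RHS) source suitable for VPERMV.
// Assumes every index into the second source lands in its defined lower half,
// which holds here because both sources were widened from half-width
// subvectors inserted at element 0.
static std::vector<int> remapMask(std::vector<int> Mask, int NumElts) {
  for (int &M : Mask)
    M = (M < NumElts) ? M : (M - NumElts / 2);
  return Mask;
}

int main() {
  // Mask from the PR34175 test: elements 0 and 8 of the first 16 x i16
  // source, and elements 0 and 8 (encoded as 32 and 40) of the second.
  std::vector<int> Mask = {0, 8, 32, 40, 0, 0, 0, 0};
  for (int M : remapMask(Mask, /*NumElts=*/32))
    std::printf("%d ", M); // prints: 0 8 16 24 0 0 0 0
  std::printf("\n");
  return 0;
}
```

Applied to the PR34175 mask in the test diff above, [0,8,32,40] over 32 x i16 becomes [0,8,16,24], matching the new vpermw constant.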
// If sources are half width, then concat and use VPERMV with adjusted
// mask.
How do we check that the CONCAT is free?
combineConcatVectorOps only combines the ops if it results in further concatenated combines (sequential loads/broadcasts, subvector/shuffle simplifications, concatenation of additional instructions etc.) - it never just returns a CONCAT_VECTORS node, as that is likely to break apart again in later folds.
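To give some intuition for the answer above, here is a simplified standalone model of the "only fold when the concat is free" policy. It is not combineConcatVectorOps itself (the struct names and tryConcatIfFree are hypothetical), and it models just one of the cases mentioned: two sequential loads that fold into a single wider load.

```cpp
#include <cstdio>
#include <optional>

// Simplified model (not LLVM's combineConcatVectorOps): a concat is treated
// as "free" only when it simplifies further; here, when both halves are
// adjacent slices of the same buffer, so the concat becomes one wide load.
struct HalfVec {
  const int *Base; // underlying buffer
  int Offset;      // element offset of this slice
  int NumElts;     // number of elements in the slice
};

struct WideVec {
  const int *Base;
  int Offset;
  int NumElts;
};

// Returns a single wide "load" only if the two halves are contiguous in
// memory; otherwise reports failure so the caller keeps the two-source
// shuffle instead of materialising an explicit concat.
static std::optional<WideVec> tryConcatIfFree(const HalfVec &Lo,
                                              const HalfVec &Hi) {
  if (Lo.Base == Hi.Base && Hi.Offset == Lo.Offset + Lo.NumElts)
    return WideVec{Lo.Base, Lo.Offset, Lo.NumElts + Hi.NumElts};
  return std::nullopt;
}

int main() {
  int Buf[32] = {};
  // Two adjacent 16-element halves, e.g. (%rdi) and 32(%rdi) as ymm loads.
  HalfVec Lo{Buf, 0, 16}, Hi{Buf, 16, 16};
  if (auto Wide = tryConcatIfFree(Lo, Hi))
    std::printf("free concat: one %d-element load at offset %d\n",
                Wide->NumElts, Wide->Offset);
  else
    std::printf("not free: keep the two-source VPERMV3\n");
  return 0;
}
```

When the two halves are not contiguous, the helper reports failure and the caller would keep the original two-source shuffle, mirroring how the new fold only fires when combineConcatVectorOps produces something better than a plain CONCAT_VECTORS.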
…k prefixes

Still some codegen difference between AVX512BW + AVX512BWVL to address
LGTM.