[X86] combineConcatVectorOps - concat per-lane v2f64/v4f64 shuffles into vXf64 vshufpd #143017
Conversation
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes: We can always concatenate v2f64 per-lane shuffles into a single vshufpd instruction, assuming we can profitably concatenate at least one of its operands. I was really hoping to get this into combineX86ShufflesRecursively, but it still can't handle concatenation as well as combineConcatVectorOps.
Patch is 28.70 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/143017.diff
2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 760119bc62604..f3cc7d57fcfba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58492,14 +58492,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
// concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
- // Only concat of subvector high halves which vperm2x128 is best at.
+ // Only concat of subvector high halves which vperm2x128 is best at or if
+ // it should fold into a subvector broadcast.
if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
- SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) &&
- SrcIdx1 == (NumSrcElts1 / 2)) {
- return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
- DAG.getBitcast(VT, Src0.getOperand(0)),
- DAG.getBitcast(VT, Src1.getOperand(0)),
- DAG.getTargetConstant(0x31, DL, MVT::i8));
+ SrcVT1.is256BitVector()) {
+ assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
+ (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
+ "Bad subvector index");
+ if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
+ (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
+ unsigned Index = 0;
+ Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
+ Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
+ return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
+ DAG.getBitcast(VT, Src0.getOperand(0)),
+ DAG.getBitcast(VT, Src1.getOperand(0)),
+ DAG.getTargetConstant(Index, DL, MVT::i8));
+ }
}
// Widen extract_subvector
// concat(extract_subvector(x,lo), extract_subvector(x,hi))
@@ -59312,6 +59321,45 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return DAG.getBitcast(VT, Res);
}
+ // We can always convert per-lane vXf64 shuffles into VSHUFPD.
+ if (!IsSplat && NumOps == 2 && VT == MVT::v4f64 &&
+ all_of(Ops, [](SDValue Op) {
+ return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP ||
+ Op.getOpcode() == X86ISD::SHUFP ||
+ Op.getOpcode() == X86ISD::VPERMILPI ||
+ Op.getOpcode() == X86ISD::BLENDI ||
+ Op.getOpcode() == X86ISD::UNPCKL ||
+ Op.getOpcode() == X86ISD::UNPCKH);
+ })) {
+ SmallVector<SDValue, 2> SrcOps0, SrcOps1;
+ SmallVector<int, 8> SrcMask0, SrcMask1;
+ if (getTargetShuffleMask(Ops[0], /*AllowSentinelZero=*/false, SrcOps0,
+ SrcMask0) &&
+ getTargetShuffleMask(Ops[1], /*AllowSentinelZero=*/false, SrcOps1,
+ SrcMask1)) {
+ assert(SrcMask0.size() == 2 && SrcMask1.size() == 2 && "Bad shuffles");
+ SDValue LHS[] = {SrcOps0[SrcMask0[0] / 2], SrcOps1[SrcMask1[0] / 2]};
+ SDValue RHS[] = {SrcOps0[SrcMask0[1] / 2], SrcOps1[SrcMask1[1] / 2]};
+ SDValue Concat0 =
+ combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
+ SDValue Concat1 =
+ combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
+ if (Concat0 || Concat1) {
+ unsigned SHUFPDMask = 0;
+ SHUFPDMask |= (SrcMask0[0] & 1) << 0;
+ SHUFPDMask |= (SrcMask0[1] & 1) << 1;
+ SHUFPDMask |= (SrcMask1[0] & 1) << 2;
+ SHUFPDMask |= (SrcMask1[1] & 1) << 3;
+ Concat0 =
+ Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
+ Concat1 =
+ Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+ }
+ }
+ }
+
return SDValue();
}
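For reference, here is a standalone sketch of how the two immediates in the hunks above are built. This is not part of the patch, just an illustration assuming the inputs match the patch's local variables (the extract_subvector indices checked against 0 or NumSrcElts/2, and the two-element masks returned by getTargetShuffleMask); the function names are made up for this example.

#include <array>
#include <cstdint>

// VPERM2X128 selector: bits [1:0] pick which 128-bit half of the first source
// feeds the destination's low lane, bits [5:4] pick the half of the second
// source feeding the high lane (0x31, the previously hard-coded value, is the
// "both high halves" case).
uint8_t buildVPerm2X128Imm(bool Src0IsHighHalf, bool Src1IsHighHalf) {
  uint8_t Imm = 0;
  Imm |= Src0IsHighHalf ? 0x01 : 0x00; // low lane <- src0 low/high half
  Imm |= Src1IsHighHalf ? 0x30 : 0x20; // high lane <- src1 low/high half
  return Imm;
}

// 4-bit VSHUFPD immediate for the concatenated v4f64 shuffle: one bit per
// result element, taken from the odd/even position selected by each per-lane
// v2f64 mask element (Mask0/Mask1 correspond to SrcMask0/SrcMask1).
uint8_t buildShufPDImm(const std::array<int, 2> &Mask0,
                       const std::array<int, 2> &Mask1) {
  uint8_t Imm = 0;
  Imm |= (Mask0[0] & 1) << 0; // lane 0, element 0 (from the LHS concat)
  Imm |= (Mask0[1] & 1) << 1; // lane 0, element 1 (from the RHS concat)
  Imm |= (Mask1[0] & 1) << 2; // lane 1, element 0
  Imm |= (Mask1[1] & 1) << 3; // lane 1, element 1
  return Imm;
}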
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
index 8d68f88249a9e..3e9fed78b56b4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
@@ -163,16 +163,14 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
;
; AVX-LABEL: store_i64_stride2_vf4:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps (%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3]
; AVX-NEXT: vmovapd %ymm1, 32(%rdx)
-; AVX-NEXT: vmovaps %ymm0, (%rdx)
+; AVX-NEXT: vmovapd %ymm0, (%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -343,16 +341,12 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
;
; AVX-LABEL: store_i64_stride2_vf8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX-NEXT: vmovaps (%rdi), %xmm2
-; AVX-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm1[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
@@ -360,9 +354,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3]
; AVX-NEXT: vmovapd %ymm3, 96(%rdx)
-; AVX-NEXT: vmovapd %ymm2, 32(%rdx)
-; AVX-NEXT: vmovaps %ymm1, 64(%rdx)
-; AVX-NEXT: vmovaps %ymm0, (%rdx)
+; AVX-NEXT: vmovapd %ymm2, 64(%rdx)
+; AVX-NEXT: vmovapd %ymm1, (%rdx)
+; AVX-NEXT: vmovapd %ymm0, 32(%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -617,26 +611,18 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
;
; AVX-LABEL: store_i64_stride2_vf16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX-NEXT: vmovaps 64(%rsi), %xmm2
-; AVX-NEXT: vmovaps 96(%rsi), %xmm3
-; AVX-NEXT: vmovaps (%rdi), %xmm4
-; AVX-NEXT: vmovaps 32(%rdi), %xmm5
-; AVX-NEXT: vmovaps 64(%rdi), %xmm6
-; AVX-NEXT: vmovaps 96(%rdi), %xmm7
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm3[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3]
@@ -651,12 +637,12 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3]
; AVX-NEXT: vmovapd %ymm7, 32(%rdx)
; AVX-NEXT: vmovapd %ymm6, 96(%rdx)
-; AVX-NEXT: vmovapd %ymm5, 160(%rdx)
-; AVX-NEXT: vmovapd %ymm4, 224(%rdx)
-; AVX-NEXT: vmovaps %ymm1, 64(%rdx)
-; AVX-NEXT: vmovaps %ymm0, (%rdx)
-; AVX-NEXT: vmovaps %ymm2, 128(%rdx)
-; AVX-NEXT: vmovaps %ymm3, 192(%rdx)
+; AVX-NEXT: vmovapd %ymm5, 64(%rdx)
+; AVX-NEXT: vmovapd %ymm4, (%rdx)
+; AVX-NEXT: vmovapd %ymm3, 160(%rdx)
+; AVX-NEXT: vmovapd %ymm2, 128(%rdx)
+; AVX-NEXT: vmovapd %ymm1, 192(%rdx)
+; AVX-NEXT: vmovapd %ymm0, 224(%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -1117,47 +1103,31 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
;
; AVX-LABEL: store_i64_stride2_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps 224(%rsi), %xmm0
-; AVX-NEXT: vmovaps 224(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 128(%rsi), %xmm1
-; AVX-NEXT: vmovaps 128(%rdi), %xmm2
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT: vmovaps (%rsi), %xmm2
-; AVX-NEXT: vmovaps 32(%rsi), %xmm3
-; AVX-NEXT: vmovaps 64(%rsi), %xmm4
-; AVX-NEXT: vmovaps 96(%rsi), %xmm5
-; AVX-NEXT: vmovaps (%rdi), %xmm6
-; AVX-NEXT: vmovaps 32(%rdi), %xmm7
-; AVX-NEXT: vmovaps 64(%rdi), %xmm8
-; AVX-NEXT: vmovaps 96(%rdi), %xmm9
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm3[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX-NEXT: vmovaps 160(%rsi), %xmm6
-; AVX-NEXT: vmovaps 160(%rdi), %xmm7
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
-; AVX-NEXT: vmovaps 192(%rsi), %xmm7
-; AVX-NEXT: vmovaps 192(%rdi), %xmm8
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[3],ymm5[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[3],ymm6[3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm9 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3]
@@ -1188,17 +1158,17 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX-NEXT: vmovapd %ymm12, 32(%rdx)
; AVX-NEXT: vmovapd %ymm11, 96(%rdx)
; AVX-NEXT: vmovapd %ymm10, 160(%rdx)
-; AVX-NEXT: vmovapd %ymm9, 288(%rdx)
-; AVX-NEXT: vmovapd %ymm8, 480(%rdx)
-; AVX-NEXT: vmovaps %ymm7, 384(%rdx)
-; AVX-NEXT: vmovaps %ymm6, 320(%rdx)
-; AVX-NEXT: vmovaps %ymm5, 192(%rdx)
-; AVX-NEXT: vmovaps %ymm4, 128(%rdx)
-; AVX-NEXT: vmovaps %ymm3, 64(%rdx)
-; AVX-NEXT: vmovaps %ymm2, (%rdx)
-; AVX-NEXT: vmovaps %ymm1, 256(%rdx)
+; AVX-NEXT: vmovapd %ymm9, 384(%rdx)
+; AVX-NEXT: vmovapd %ymm8, 320(%rdx)
+; AVX-NEXT: vmovapd %ymm7, 192(%rdx)
+; AVX-NEXT: vmovapd %ymm6, 128(%rdx)
+; AVX-NEXT: vmovapd %ymm5, 64(%rdx)
+; AVX-NEXT: vmovapd %ymm4, (%rdx)
+; AVX-NEXT: vmovapd %ymm3, 288(%rdx)
+; AVX-NEXT: vmovapd %ymm2, 256(%rdx)
+; AVX-NEXT: vmovapd %ymm1, 448(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 448(%rdx)
+; AVX-NEXT: vmovaps %ymm0, 480(%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -2080,102 +2050,70 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
; AVX-LABEL: store_i64_stride2_vf64:
; AVX: # %bb.0:
; AVX-NEXT: subq $424, %rsp # imm = 0x1A8
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX-NEXT: vmovaps 64(%rsi), %xmm2
-; AVX-NEXT: vmovaps 96(%rsi), %xmm3
-; AVX-NEXT: vmovaps (%rdi), %xmm4
-; AVX-NEXT: vmovaps 32(%rdi), %xmm5
-; AVX-NEXT: vmovaps 64(%rdi), %xmm6
-; AVX-NEXT: vmovaps 96(%rdi), %xmm7
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm1[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm2[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm3[0]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 128(%rsi), %xmm0
-; AVX-NEXT: vmovaps 128(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 160(%rsi), %xmm0
-; AVX-NEXT: vmovaps 160(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 192(%rsi), %xmm0
-; AVX-NEXT: vmovaps 192(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 224(%rsi), %xmm0
-; AVX-NEXT: vmovaps 224(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 256(%rsi), %xmm0
-; AVX-NEXT: vmovaps 256(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 288(%rsi), %xmm0
-; AVX-NEXT: vmovaps 288(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 320(%rsi), %xmm0
-; AVX-NEXT: vmovaps 320(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 352(%rsi), %xmm0
-; AVX-NEXT: vmovaps 352(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 384(%rsi), %xmm0
-; AVX-NEXT: vmovaps 384(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovaps 416(%rsi), %xmm0
-; AVX-NEXT: vmovaps 416(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 448(%rsi), %xmm0
-; AVX-NEXT: vmovaps 448(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 480(%rsi), %xmm0
-; AVX-NEXT: vmovaps 480(%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = x...
[truncated]
Force-pushed from 17eb1d3 to 9e9fe5e ("…vshufpd", "…ccepted shuffle opcode list").
Hi @RKSimon, this commit is my only suspect for the accuracy failure in 454.calculix (with …). By any chance, was there any issue reported about this commit anywhere else? Thank you.
Can you provide an asm diff?
Sure, I will. I just need to verify that the regression happens exactly after this commit. Across the above range of commits I do see changes in the shuffling, but I also see some reordering of FP math operations (which might be caused by some other pass kicking in after your change). I suppose …
I created #143606. Let's move the discussion there.
…nto vXf64 vshufpd (llvm#143017) We can always concatenate v2f64/v4f64 per-lane shuffles into a single vshufpd instruction, assuming we can profitably concatenate at least one of its operands (or it's a unary shuffle). I was really hoping to get this into combineX86ShufflesRecursively, but it still can't handle concatenation/length changing as well as combineConcatVectorOps.
@RKSimon we're seeing significant increases in memory usage when compiling certain translation units inside Google. In several cases the memory consumption goes above 12 GB (it might go higher; we just stopped the compilation at this limit). Compilation times also increase significantly for these cases. a6ace28 does not fix the issue. We're working on a reproducer.
The reduced test case is here: https://gcc.godbolt.org/z/e8f6bxPfT. It looks like an infinite loop, given that the input is quite small. A profile of a few seconds of the Clang execution was attached (screenshot omitted here).
And a simpler reproducer (using code after the SLP vectorizer): https://gcc.godbolt.org/z/bMGrr8Gze
Thanks, it looks like it has awakened a latent bug in load combining (which hasn't been touched in a long time...).
; ModuleID = 'bugpoint-reduced-simplified.bc'
source_filename = "bug.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
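; Per the comments above: this loads a <6 x i64> at %arg+8 and stores a
; <4 x i64> interleaving elements 0,4,1,5 of it; lowering this shuffle is
; what appears to trip the pre-existing infinite loop in load combining.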
define void @blam(ptr readonly align 8 captures(none) dereferenceable(64) %arg) #0 {
%getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 8
%i = load <6 x i64>, ptr %getelementptr, align 8
%i1 = shufflevector <6 x i64> %i, <6 x i64> poison, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
store <4 x i64> %i1, ptr poison, align 8
ret void
}
attributes #0 = { "target-features"="+avx" }
It looks like #140919 was the actual culprit; I'm putting together a partial reversion.
We can always concatenate v2f64/v4f64 per-lane shuffles into a single vshufpd instruction, assuming we can profitably concatenate at least one of its operands (or it's a unary shuffle).
I was really hoping to get this into combineX86ShufflesRecursively but it still can't handle concatenation as well as combineConcatVectorOps.
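As a reading aid for the AVX check lines above, here is a minimal sketch of v4f64 VSHUFPD element selection in plain C++ (not LLVM code; A and B stand in for the two ymm sources). With an immediate of 0xC it returns { A[0], B[0], A[3], B[3] }, which is the "ymm2[0],ymm1[0],ymm2[3],ymm1[3]" pattern produced by the new combine in the interleaved-store tests.

#include <array>
#include <cstdint>

// Per-lane shuffle: element 0 of each 128-bit lane comes from A, element 1
// from B; each immediate bit selects the low or high double within that
// lane of the chosen source.
std::array<double, 4> shufpd256(const std::array<double, 4> &A,
                                const std::array<double, 4> &B, uint8_t Imm) {
  return {A[(Imm >> 0) & 1],        // lane 0, element 0 from A
          B[(Imm >> 1) & 1],        // lane 0, element 1 from B
          A[2 + ((Imm >> 2) & 1)],  // lane 1, element 0 from A
          B[2 + ((Imm >> 3) & 1)]}; // lane 1, element 1 from B
}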