Skip to content

Commit 1753aba

Browse files
authored
[X86] combineINSERT_SUBVECTOR - directly fold to X86ISD::SUBV_BROADCAST_LOAD to prevent vector split infinite loop (#145077)
This reverts #140919 / f1d03de, which could let another fold split the concatenation apart again before it was folded to a SUBV_BROADCAST_LOAD, resulting in an infinite loop.
1 parent 1db9afb commit 1753aba

File tree

3 files changed

+332
-281
lines changed

3 files changed

+332
-281
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59465,16 +59465,20 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
5946559465
}
5946659466

5946759467
// If we're splatting the lower half subvector of a full vector load into the
59468-
// upper half, just splat the subvector directly, potentially creating a
59469-
// subvector broadcast.
59468+
// upper half, attempt to create a subvector broadcast.
5947059469
if ((int)IdxVal == (VecNumElts / 2) &&
5947159470
Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
5947259471
auto *VecLd = dyn_cast<LoadSDNode>(Vec);
5947359472
auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
5947459473
if (VecLd && SubLd &&
5947559474
DAG.areNonVolatileConsecutiveLoads(
5947659475
SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59477-
return concatSubVectors(SubVec, SubVec, DAG, dl);
59476+
SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59477+
SubVecVT, SubLd, 0, DAG);
59478+
SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59479+
BcastLd, DAG.getVectorIdxConstant(0, dl));
59480+
DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59481+
return BcastLd;
5947859482
}
5947959483
}
5948059484

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2508,3 +2508,50 @@ define void @D107009(ptr %input, ptr %output) {
25082508
store <64 x i32> %i7, ptr %output, align 16
25092509
ret void
25102510
}
2511+
2512+
; Ensure repeated subvector loads are concatenated before the vector can be split apart.
2513+
define void @split_v2i64_subvector_broadcast(ptr readonly align 8 captures(none) dereferenceable(64) %arg) {
2514+
; SSE-LABEL: split_v2i64_subvector_broadcast:
2515+
; SSE: # %bb.0:
2516+
; SSE-NEXT: movups 8(%rdi), %xmm0
2517+
; SSE-NEXT: movups 40(%rdi), %xmm1
2518+
; SSE-NEXT: movaps %xmm0, %xmm2
2519+
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
2520+
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2521+
; SSE-NEXT: movups %xmm0, (%rax)
2522+
; SSE-NEXT: movups %xmm2, (%rax)
2523+
; SSE-NEXT: retq
2524+
;
2525+
; AVX1-LABEL: split_v2i64_subvector_broadcast:
2526+
; AVX1: # %bb.0:
2527+
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
2528+
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
2529+
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2530+
; AVX1-NEXT: vmovupd %ymm0, (%rax)
2531+
; AVX1-NEXT: vzeroupper
2532+
; AVX1-NEXT: retq
2533+
;
2534+
; AVX2-LABEL: split_v2i64_subvector_broadcast:
2535+
; AVX2: # %bb.0:
2536+
; AVX2-NEXT: vmovups 40(%rdi), %xmm0
2537+
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
2538+
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,1,3]
2539+
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2540+
; AVX2-NEXT: vmovups %ymm0, (%rax)
2541+
; AVX2-NEXT: vzeroupper
2542+
; AVX2-NEXT: retq
2543+
;
2544+
; XOP-LABEL: split_v2i64_subvector_broadcast:
2545+
; XOP: # %bb.0:
2546+
; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
2547+
; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
2548+
; XOP-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
2549+
; XOP-NEXT: vmovupd %ymm0, (%rax)
2550+
; XOP-NEXT: vzeroupper
2551+
; XOP-NEXT: retq
2552+
%gep = getelementptr inbounds nuw i8, ptr %arg, i64 8
2553+
%load = load <6 x i64>, ptr %gep, align 8
2554+
%shuffle = shufflevector <6 x i64> %load, <6 x i64> poison, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2555+
store <4 x i64> %shuffle, ptr poison, align 8
2556+
ret void
2557+
}

0 commit comments

Comments
 (0)