Skip to content

Commit a7f226f

Browse files
committed
AArch64: avoid creating cycle in DAG for post-increment NEON ops.
Inserting a value into Visited has the effect of terminating a search for predecessors if that node is seen. This is legitimate for the base address, and acts as a slight performance optimization, but the vector-building node can be part of a legitimate cycle, so we shouldn't stop searching there. PR43056. llvm-svn: 370036
1 parent bccbd74 commit a7f226f

File tree

2 files changed

+20
-1
lines changed

2 files changed

+20
-1
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10694,7 +10694,7 @@ static SDValue performPostLD1Combine(SDNode *N,
1069410694
// are predecessors to each other or the Vector.
1069510695
SmallPtrSet<const SDNode *, 32> Visited;
1069610696
SmallVector<const SDNode *, 16> Worklist;
10697-
Visited.insert(N);
10697+
Visited.insert(Addr.getNode());
1069810698
Worklist.push_back(User);
1069910699
Worklist.push_back(LD);
1070010700
Worklist.push_back(Vector.getNode());

llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6319,3 +6319,22 @@ define void @test_ld1lane_build_i8(i8* %a, i8* %b, i8* %c, i8* %d, i8* %e, i8*
63196319
store <8 x i8> %sub, <8 x i8>* %p
63206320
ret void
63216321
}
6322+
6323+
define <4 x i32> @test_inc_cycle(<4 x i32> %vec, i32* %in) {
6324+
; CHECK-LABEL: test_inc_cycle:
6325+
; CHECK: ld1.s { v0 }[0], [x0]{{$}}
6326+
6327+
%elt = load i32, i32* %in
6328+
%newvec = insertelement <4 x i32> %vec, i32 %elt, i32 0
6329+
6330+
; %inc cannot be %elt directly because we check that the load is only
6331+
; used by the insert before trying to form post-inc.
6332+
%inc.vec = bitcast <4 x i32> %newvec to <2 x i64>
6333+
%inc = extractelement <2 x i64> %inc.vec, i32 0
6334+
%newaddr = getelementptr i32, i32* %in, i64 %inc
6335+
store i32* %newaddr, i32** @var
6336+
6337+
ret <4 x i32> %newvec
6338+
}
6339+
6340+
@var = global i32* null

0 commit comments

Comments
 (0)