Skip to content

Commit 2103c89

Browse files
committed
LAA: be more precise on different store sizes
The HasSameSize checks, which are triggered on different store sizes, in MemoryDepChecker::isDependent are ad-hoc and imprecise, leading to spurious dependencies and runtime-checks. Identify that the exact scenario in which to bail out is unequal store sizes when the dependence distance is zero, and check precisely this condition in MemoryDepChecker::getDependenceDistanceStrideAndSize, eliminating all the ad-hoc checks in isDependent and making LoopAccessAnalysis more precise.
1 parent 418f5cd commit 2103c89

File tree

4 files changed

+60
-48
lines changed

4 files changed

+60
-48
lines changed

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 26 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1990,11 +1990,24 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
19901990
return MemoryDepChecker::Dependence::Unknown;
19911991
}
19921992

1993-
uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1994-
bool HasSameSize =
1995-
DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
1996-
if (!HasSameSize)
1997-
TypeByteSize = 0;
1993+
// When the distance is zero, we're reading/writing the same memory location:
1994+
// check that the store sizes are equal. Otherwise, fail with an unknown
1995+
// dependence for which we should not generate runtime checks.
1996+
TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
1997+
BStoreSz = DL.getTypeStoreSize(BTy);
1998+
if (Dist->isZero() && AStoreSz != BStoreSz) {
1999+
LLVM_DEBUG(
2000+
dbgs() << "LAA: zero dependence distance but different type sizes\n");
2001+
return Dependence::Unknown;
2002+
}
2003+
2004+
// Bail early on scalable store sizes.
2005+
if (AStoreSz.isScalable() || BStoreSz.isScalable())
2006+
return Dependence::Unknown;
2007+
2008+
// The TypeByteSize is used to scale Distance and VF. In these contexts, the
2009+
// only size that matters is the size of the Sink.
2010+
uint64_t TypeByteSize = alignTo(BStoreSz, DL.getABITypeAlign(BTy).value());
19982011

19992012
StrideAPtrInt = std::abs(StrideAPtrInt);
20002013
StrideBPtrInt = std::abs(StrideBPtrInt);
@@ -2030,7 +2043,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20302043
auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck,
20312044
TypeByteSize, AIsWrite, BIsWrite] =
20322045
std::get<DepDistanceStrideAndSizeInfo>(Res);
2033-
bool HasSameSize = TypeByteSize > 0;
20342046

20352047
if (isa<SCEVCouldNotCompute>(Dist)) {
20362048
// TODO: Relax requirement that there is a common unscaled stride to retry
@@ -2048,9 +2060,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20482060
// upper bound of the number of iterations), the accesses are independent, i.e.
20492061
// they are far enough apart that accesses won't access the same location
20502062
// across all loop iterations.
2051-
if (HasSameSize && isSafeDependenceDistance(
2052-
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
2053-
*Dist, MaxStride, TypeByteSize))
2063+
if (isSafeDependenceDistance(DL, SE,
2064+
*(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist,
2065+
MaxStride, TypeByteSize))
20542066
return Dependence::NoDep;
20552067

20562068
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
@@ -2061,7 +2073,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20612073

20622074
// If the distance between accesses and their strides are known constants,
20632075
// check whether the accesses interlace each other.
2064-
if (Distance > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
2076+
if (Distance > 0 && CommonStride && CommonStride > 1 &&
20652077
areStridedAccessesIndependent(Distance, *CommonStride, TypeByteSize)) {
20662078
LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
20672079
return Dependence::NoDep;
@@ -2075,15 +2087,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20752087

20762088
// Negative distances are not plausible dependencies.
20772089
if (SE.isKnownNonPositive(Dist)) {
2078-
if (SE.isKnownNonNegative(Dist)) {
2079-
if (HasSameSize) {
2080-
// Write to the same location with the same size.
2081-
return Dependence::Forward;
2082-
}
2083-
LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2084-
"different type sizes\n");
2085-
return Dependence::Unknown;
2086-
}
2090+
if (SE.isKnownNonNegative(Dist))
2091+
// Write to the same location with the same size.
2092+
return Dependence::Forward;
20872093

20882094
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
20892095
// Check if the first access writes to a location that is read in a later
@@ -2103,8 +2109,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21032109
FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
21042110
return Dependence::Unknown;
21052111
}
2106-
if (!HasSameSize ||
2107-
couldPreventStoreLoadForward(
2112+
if (couldPreventStoreLoadForward(
21082113
ConstDist->getAPInt().abs().getZExtValue(), TypeByteSize)) {
21092114
LLVM_DEBUG(
21102115
dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
@@ -2135,12 +2140,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21352140
FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
21362141
}
21372142

2138-
if (!HasSameSize) {
2139-
LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2140-
"different type sizes\n");
2141-
return Dependence::Unknown;
2142-
}
2143-
21442143
if (!CommonStride)
21452144
return Dependence::Unknown;
21462145

llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,8 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
129129
; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence'
130130
; CHECK-NEXT: loop:
131131
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
132-
; CHECK-NEXT: Unknown data dependence.
132+
; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
133133
; CHECK-NEXT: Dependences:
134-
; CHECK-NEXT: Unknown:
135-
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
136-
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
137-
; CHECK-EMPTY:
138-
; CHECK-NEXT: Unknown:
139-
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
140-
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
141-
; CHECK-EMPTY:
142134
; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
143135
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
144136
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8

llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,6 @@ define void @forward_different_access_sizes(ptr readnone %end, ptr %start) {
7070
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
7171
; CHECK-NEXT: %l = load i24, ptr %gep.1, align 1
7272
; CHECK-EMPTY:
73-
; CHECK-NEXT: Forward:
74-
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
75-
; CHECK-NEXT: store i24 %l, ptr %ptr.iv, align 1
76-
; CHECK-EMPTY:
7773
; CHECK-NEXT: Run-time memory checks:
7874
; CHECK-NEXT: Grouped accesses:
7975
; CHECK-EMPTY:

llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) {
3535
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1
3636
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1
3737
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 1
38-
; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope !0
39-
; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope !0
40-
; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope !0
41-
; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope !0
38+
; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope [[META0:![0-9]+]]
39+
; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope [[META0]]
40+
; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope [[META0]]
41+
; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope [[META0]]
4242
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i24> poison, i24 [[TMP13]], i32 0
4343
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i24> [[TMP17]], i24 [[TMP14]], i32 1
4444
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i24> [[TMP18]], i24 [[TMP15]], i32 2
@@ -47,7 +47,7 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) {
4747
; CHECK-NEXT: [[TMP22:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[TMP21]]
4848
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
4949
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
50-
; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP24]], align 4, !alias.scope !3, !noalias !0
50+
; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP24]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
5151
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
5252
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
5353
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -96,17 +96,42 @@ exit:
9696
define void @pr58722_store_interleave_group(ptr %src, ptr %dst) {
9797
; CHECK-LABEL: @pr58722_store_interleave_group(
9898
; CHECK-NEXT: entry:
99+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
100+
; CHECK: vector.ph:
101+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
102+
; CHECK: vector.body:
103+
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
104+
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
105+
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
106+
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
107+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i32 [[TMP0]]
108+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP1]]
109+
; CHECK-NEXT: store i32 [[TMP0]], ptr [[TMP2]], align 4
110+
; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4
111+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 1
112+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i64 1
113+
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP0]] to i24
114+
; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP1]] to i24
115+
; CHECK-NEXT: store i24 [[TMP6]], ptr [[TMP4]], align 4
116+
; CHECK-NEXT: store i24 [[TMP7]], ptr [[TMP5]], align 4
117+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
118+
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 5000
119+
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
120+
; CHECK: middle.block:
121+
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
122+
; CHECK: scalar.ph:
123+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
99124
; CHECK-NEXT: br label [[LOOP:%.*]]
100125
; CHECK: loop:
101-
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
102-
; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i32 [[IV]]
126+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
127+
; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[IV]]
103128
; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_IV]], align 4
104129
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[GEP_IV]], i64 1
105130
; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i32 [[IV]] to i24
106131
; CHECK-NEXT: store i24 [[TRUNC_IV]], ptr [[GEP]], align 4
107132
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 2
108133
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 10000
109-
; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
134+
; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
110135
; CHECK: exit:
111136
; CHECK-NEXT: ret void
112137
;

0 commit comments

Comments
 (0)