@@ -1799,8 +1799,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1799
1799
///     }
1800
1800
static bool isSafeDependenceDistance (const DataLayout &DL, ScalarEvolution &SE,
1801
1801
const SCEV &MaxBTC, const SCEV &Dist,
1802
- uint64_t MaxStride,
1803
- uint64_t TypeByteSize) {
1802
+ uint64_t MaxStride) {
1804
1803
1805
1804
// If we can prove that
1806
1805
// (**) |Dist| > MaxBTC * Step
@@ -1819,8 +1818,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1819
1818
// will be executed only if LoopCount >= VF, proving distance >= LoopCount
1820
1819
// also guarantees that distance >= VF.
1821
1820
//
1822
- const uint64_t ByteStride = MaxStride * TypeByteSize;
1823
- const SCEV *Step = SE.getConstant (MaxBTC.getType (), ByteStride);
1821
+ const SCEV *Step = SE.getConstant (MaxBTC.getType (), MaxStride);
1824
1822
const SCEV *Product = SE.getMulExpr (&MaxBTC, Step);
1825
1823
1826
1824
const SCEV *CastedDist = &Dist;
@@ -1864,9 +1862,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1864
1862
if (Distance % TypeByteSize)
1865
1863
return false ;
1866
1864
1867
- uint64_t ScaledDist = Distance / TypeByteSize;
1868
-
1869
- // No dependence if the scaled distance is not multiple of the stride.
1865
+ // No dependence if the distance is not multiple of the stride.
1870
1866
// E.g.
1871
1867
// for (i = 0; i < 1024 ; i += 4)
1872
1868
// A[i+2] = A[i] + 1;
@@ -1882,7 +1878,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1882
1878
// Two accesses in memory (scaled distance is 4, stride is 3):
1883
1879
// | A[0] | | | A[3] | | | A[6] | | |
1884
1880
// | | | | | A[4] | | | A[7] | |
1885
- return ScaledDist % Stride;
1881
+ return Distance % Stride;
1886
1882
}
1887
1883
1888
1884
std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1921,6 +1917,7 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
1921
1917
if (StrideAPtr && *StrideAPtr < 0 ) {
1922
1918
std::swap (Src, Sink);
1923
1919
std::swap (AInst, BInst);
1920
+ std::swap (ATy, BTy);
1924
1921
std::swap (StrideAPtr, StrideBPtr);
1925
1922
}
1926
1923
@@ -1972,30 +1969,68 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
1972
1969
return MemoryDepChecker::Dependence::IndirectUnsafe;
1973
1970
}
1974
1971
1975
- int64_t StrideAPtrInt = *StrideAPtr;
1976
- int64_t StrideBPtrInt = *StrideBPtr;
1977
- LLVM_DEBUG (dbgs () << " LAA: Src induction step: " << StrideAPtrInt
1978
- << " Sink induction step: " << StrideBPtrInt << " \n " );
1972
+ LLVM_DEBUG (dbgs () << " LAA: Src induction step: " << *StrideAPtr
1973
+ << " Sink induction step: " << *StrideBPtr << " \n " );
1974
+
1975
+ // Note that store size is different from alloc size, which is dependent on
1976
+ // store size. We use the former for checking illegal cases, and the latter
1977
+ // for scaling strides.
1978
+ TypeSize AStoreSz = DL.getTypeStoreSize (ATy),
1979
+ BStoreSz = DL.getTypeStoreSize (BTy);
1980
+
1981
+ // When the distance is zero, we're reading/writing the same memory location:
1982
+ // check that the store sizes are equal. Otherwise, fail with an unknown
1983
+ // dependence for which we should not generate runtime checks.
1984
+ if (Dist->isZero () && AStoreSz != BStoreSz)
1985
+ return MemoryDepChecker::Dependence::Unknown;
1986
+
1987
+ // We can't get a uint64_t for the AllocSize if either of the store sizes
1988
+ // are scalable.
1989
+ if (AStoreSz.isScalable () || BStoreSz.isScalable ())
1990
+ return MemoryDepChecker::Dependence::Unknown;
1991
+
1992
+ // The TypeByteSize is used to scale Distance and VF. In these contexts, the
1993
+ // only size that matters is the size of the Sink.
1994
+ uint64_t ASz = alignTo (AStoreSz, DL.getABITypeAlign (ATy).value ()),
1995
+ TypeByteSize = alignTo (BStoreSz, DL.getABITypeAlign (BTy).value ());
1996
+
1997
+ // We scale the strides by the alloc-type-sizes, so we can check that the
1998
+ // common distance is equal when ASz != BSz.
1999
+ int64_t StrideAScaled = *StrideAPtr * ASz;
2000
+ int64_t StrideBScaled = *StrideBPtr * TypeByteSize;
2001
+
1979
2002
// At least Src or Sink are loop invariant and the other is strided or
1980
2003
// invariant. We can generate a runtime check to disambiguate the accesses.
1981
- if (!StrideAPtrInt || !StrideBPtrInt )
2004
+ if (!StrideAScaled || !StrideBScaled )
1982
2005
return MemoryDepChecker::Dependence::Unknown;
1983
2006
1984
2007
// Both Src and Sink have a constant stride, check if they are in the same
1985
2008
// direction.
1986
- if ((StrideAPtrInt > 0 ) != (StrideBPtrInt > 0 )) {
2009
+ if ((StrideAScaled > 0 ) != (StrideBScaled > 0 )) {
1987
2010
LLVM_DEBUG (
1988
2011
dbgs () << " Pointer access with strides in different directions\n " );
1989
2012
return MemoryDepChecker::Dependence::Unknown;
1990
2013
}
1991
2014
1992
- uint64_t TypeByteSize = DL.getTypeAllocSize (ATy);
1993
- bool HasSameSize =
1994
- DL.getTypeStoreSizeInBits (ATy) == DL.getTypeStoreSizeInBits (BTy);
1995
- if (!HasSameSize)
1996
- TypeByteSize = 0 ;
1997
- return DepDistanceStrideAndSizeInfo (Dist, std::abs (StrideAPtrInt),
1998
- std::abs (StrideBPtrInt), TypeByteSize,
2015
+ StrideAScaled = std::abs (StrideAScaled);
2016
+ StrideBScaled = std::abs (StrideBScaled);
2017
+
2018
+ // MaxStride is the max of the scaled strides, as expected.
2019
+ uint64_t MaxStride = std::max (StrideAScaled, StrideBScaled);
2020
+
2021
+ // CommonStride is set if both scaled strides are equal.
2022
+ std::optional<uint64_t > CommonStride;
2023
+ if (StrideAScaled == StrideBScaled)
2024
+ CommonStride = StrideAScaled;
2025
+
2026
+ // TODO: Historically, we don't retry with runtime checks unless the unscaled
2027
+ // strides are the same, but this doesn't make sense. Fix this once the
2028
+ // condition for runtime checks in isDependent is fixed.
2029
+ bool ShouldRetryWithRuntimeCheck =
2030
+ std::abs (*StrideAPtr) == std::abs (*StrideBPtr);
2031
+
2032
+ return DepDistanceStrideAndSizeInfo (Dist, MaxStride, CommonStride,
2033
+ ShouldRetryWithRuntimeCheck, TypeByteSize,
1999
2034
AIsWrite, BIsWrite);
2000
2035
}
2001
2036
@@ -2011,32 +2046,28 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2011
2046
if (std::holds_alternative<Dependence::DepType>(Res))
2012
2047
return std::get<Dependence::DepType>(Res);
2013
2048
2014
- auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] =
2049
+ auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck,
2050
+ TypeByteSize, AIsWrite, BIsWrite] =
2015
2051
std::get<DepDistanceStrideAndSizeInfo>(Res);
2016
- bool HasSameSize = TypeByteSize > 0 ;
2017
2052
2018
- std::optional<uint64_t > CommonStride =
2019
- StrideA == StrideB ? std::make_optional (StrideA) : std::nullopt;
2020
2053
if (isa<SCEVCouldNotCompute>(Dist)) {
2021
- // TODO: Relax requirement that there is a common stride to retry with
2022
- // non-constant distance dependencies.
2023
- FoundNonConstantDistanceDependence |= CommonStride. has_value () ;
2054
+ // TODO: Relax requirement that there is a common unscaled stride to retry
2055
+ // with non-constant distance dependencies.
2056
+ FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck ;
2024
2057
LLVM_DEBUG (dbgs () << " LAA: Dependence because of uncomputable distance.\n " );
2025
2058
return Dependence::Unknown;
2026
2059
}
2027
2060
2028
2061
ScalarEvolution &SE = *PSE.getSE ();
2029
2062
auto &DL = InnermostLoop->getHeader ()->getDataLayout ();
2030
- uint64_t MaxStride = std::max (StrideA, StrideB);
2031
2063
2032
2064
// If the distance between the accesses is larger than their maximum absolute
2033
2065
// stride multiplied by the symbolic maximum backedge taken count (which is an
2034
2066
// upper bound of the number of iterations), the accesses are independent, i.e.
2035
2067
// they are far enough apart that accesses won't access the same location
2036
2068
// across all loop iterations.
2037
- if (HasSameSize && isSafeDependenceDistance (
2038
- DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount ()),
2039
- *Dist, MaxStride, TypeByteSize))
2069
+ if (isSafeDependenceDistance (
2070
+ DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount ()), *Dist, MaxStride))
2040
2071
return Dependence::NoDep;
2041
2072
2042
2073
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
@@ -2047,7 +2078,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2047
2078
2048
2079
// If the distance between accesses and their strides are known constants,
2049
2080
// check whether the accesses interlace each other.
2050
- if (Distance > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
2081
+ if (Distance > 0 && CommonStride && CommonStride > 1 &&
2051
2082
areStridedAccessesIndependent (Distance, *CommonStride, TypeByteSize)) {
2052
2083
LLVM_DEBUG (dbgs () << " LAA: Strided accesses are independent\n " );
2053
2084
return Dependence::NoDep;
@@ -2061,15 +2092,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2061
2092
2062
2093
// Negative distances are not plausible dependencies.
2063
2094
if (SE.isKnownNonPositive (Dist)) {
2064
- if (SE.isKnownNonNegative (Dist)) {
2065
- if (HasSameSize) {
2066
- // Write to the same location with the same size.
2067
- return Dependence::Forward;
2068
- }
2069
- LLVM_DEBUG (dbgs () << " LAA: possibly zero dependence difference but "
2070
- " different type sizes\n " );
2071
- return Dependence::Unknown;
2072
- }
2095
+ if (SE.isKnownNonNegative (Dist))
2096
+ // Write to the same location.
2097
+ return Dependence::Forward;
2073
2098
2074
2099
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2075
2100
// Check if the first access writes to a location that is read in a later
@@ -2084,13 +2109,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2084
2109
if (!ConstDist) {
2085
2110
// TODO: FoundNonConstantDistanceDependence is used as a necessary
2086
2111
// condition to consider retrying with runtime checks. Historically, we
2087
- // did not set it when strides were different but there is no inherent
2088
- // reason to.
2089
- FoundNonConstantDistanceDependence |= CommonStride. has_value () ;
2112
+ // did not set it when unscaled strides were different but there is no
2113
+ // inherent reason to.
2114
+ FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck ;
2090
2115
return Dependence::Unknown;
2091
2116
}
2092
- if (!HasSameSize ||
2093
- couldPreventStoreLoadForward (
2117
+ if (couldPreventStoreLoadForward (
2094
2118
ConstDist->getAPInt ().abs ().getZExtValue (), TypeByteSize)) {
2095
2119
LLVM_DEBUG (
2096
2120
dbgs () << " LAA: Forward but may prevent st->ld forwarding\n " );
@@ -2105,27 +2129,20 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2105
2129
int64_t MinDistance = SE.getSignedRangeMin (Dist).getSExtValue ();
2106
2130
// Below we only handle strictly positive distances.
2107
2131
if (MinDistance <= 0 ) {
2108
- FoundNonConstantDistanceDependence |= CommonStride. has_value () ;
2132
+ FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck ;
2109
2133
return Dependence::Unknown;
2110
2134
}
2111
2135
2112
- if (!ConstDist) {
2136
+ if (!ConstDist)
2113
2137
// Previously this case would be treated as Unknown, possibly setting
2114
2138
// FoundNonConstantDistanceDependence to force re-trying with runtime
2115
2139
// checks. Until the TODO below is addressed, set it here to preserve
2116
2140
// original behavior w.r.t. re-trying with runtime checks.
2117
2141
// TODO: FoundNonConstantDistanceDependence is used as a necessary
2118
2142
// condition to consider retrying with runtime checks. Historically, we
2119
- // did not set it when strides were different but there is no inherent
2120
- // reason to.
2121
- FoundNonConstantDistanceDependence |= CommonStride.has_value ();
2122
- }
2123
-
2124
- if (!HasSameSize) {
2125
- LLVM_DEBUG (dbgs () << " LAA: ReadWrite-Write positive dependency with "
2126
- " different type sizes\n " );
2127
- return Dependence::Unknown;
2128
- }
2143
+ // did not set it when unscaled strides were different but there is no
2144
+ // inherent reason to.
2145
+ FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
2129
2146
2130
2147
if (!CommonStride)
2131
2148
return Dependence::Unknown;
@@ -2140,8 +2157,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2140
2157
2141
2158
// It's not vectorizable if the distance is smaller than the minimum distance
2142
2159
// needed for a vectorized/unrolled version. Vectorizing one iteration in
2143
- // front needs TypeByteSize * Stride . Vectorizing the last iteration needs
2144
- // TypeByteSize (No need to plus the last gap distance).
2160
+ // front needs CommonStride . Vectorizing the last iteration needs TypeByteSize
2161
+ // (No need to add the last gap distance).
2145
2162
//
2146
2163
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2147
2164
// foo(int *A) {
@@ -2168,8 +2185,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2168
2185
// We know that Dist is positive, but it may not be constant. Use the signed
2169
2186
// minimum for computations below, as this ensures we compute the closest
2170
2187
// possible dependence distance.
2171
- uint64_t MinDistanceNeeded =
2172
- TypeByteSize * *CommonStride * (MinNumIter - 1 ) + TypeByteSize;
2188
+ uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1 ) + TypeByteSize;
2173
2189
if (MinDistanceNeeded > static_cast <uint64_t >(MinDistance)) {
2174
2190
if (!ConstDist) {
2175
2191
// For non-constant distances, we checked the lower bound of the
@@ -2225,7 +2241,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2225
2241
2226
2242
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
2227
2243
// since there is a backwards dependency.
2228
- uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * * CommonStride) ;
2244
+ uint64_t MaxVF = MinDepDistBytes / * CommonStride;
2229
2245
LLVM_DEBUG (dbgs () << " LAA: Positive min distance " << MinDistance
2230
2246
<< " with max VF = " << MaxVF << ' \n ' );
2231
2247
0 commit comments