Commit 8397d75
[GVN] Load-store forwarding of scalable store to fixed load.

When storing a scalable vector and vscale is a compile-time known constant, allow store-to-load forwarding through temporary @llvm.vector.extract calls, even if the loaded vector is fixed-size rather than scalable. InstCombine then folds the insert/extract pair away.

The use case is shown in this [godbolt link](https://godbolt.org/z/KT3sMrMbd), which shows that clang generates IR matching this pattern when the "arm_sve_vector_bits" attribute is used:

```c
typedef svfloat32_t svfloat32_fixed_t
    __attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t
add(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t
foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```
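To make the pattern concrete, here is a minimal IR sketch, modelled on the scalable_store_to_fixed_load test updated in this commit (the function name and the vscale_range(4,4) choice are illustrative): with vscale pinned to 4, the <vscale x 4 x float> store covers exactly the 512 bits read back by the <16 x float> load.

```llvm
; Sketch, assuming vscale_range(4,4): <vscale x 4 x float> is exactly
; 512 bits, the same size as <16 x float>.
define <vscale x 4 x float> @add_fixed(<vscale x 4 x float> %a) vscale_range(4,4) {
entry:
  %retval = alloca { <16 x float> }
  %add = fadd <vscale x 4 x float> %a, %a
  ; Scalable store ...
  store <vscale x 4 x float> %add, ptr %retval
  ; ... read back by a fixed-size load. GVN now forwards %add to %load via
  ; @llvm.vector.extract, and InstCombine folds the resulting extract/insert
  ; pair, leaving "ret <vscale x 4 x float> %add".
  %load = load <16 x float>, ptr %retval
  %cast = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %load, i64 0)
  ret <vscale x 4 x float> %cast
}

declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64)
```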
1 parent 1343f8f commit 8397d75

File tree

4 files changed (+69 −24 lines)


llvm/include/llvm/Transforms/Utils/VNCoercion.h

Lines changed: 2 additions & 1 deletion

```diff
@@ -23,6 +23,7 @@

 namespace llvm {
 class Constant;
+class Function;
 class StoreInst;
 class LoadInst;
 class MemIntrinsic;
@@ -35,7 +36,7 @@ namespace VNCoercion {
 /// Return true if CoerceAvailableValueToLoadType would succeed if it was
 /// called.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL);
+                                     Function *F);

 /// If we saw a store of a value to memory, and then a load from a must-aliased
 /// pointer of a different type, try to coerce the stored value to the loaded
```

llvm/lib/Transforms/Scalar/GVN.cpp

Lines changed: 5 additions & 3 deletions

```diff
@@ -1291,7 +1291,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,

         // If MD reported clobber, check it was nested.
         if (DepInfo.isClobber() &&
-            canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) {
+            canCoerceMustAliasedValueToLoad(DepLoad, LoadType,
+                                            DepLoad->getFunction())) {
           const auto ClobberOff = MD->getClobberOffset(DepLoad);
           // GVN has no deal with a negative offset.
           Offset = (ClobberOff == std::nullopt || *ClobberOff < 0)
@@ -1343,7 +1344,7 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // different types if we have to. If the stored value is convertable to
     // the loaded value, we can reuse it.
     if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), Load->getType(),
-                                         DL))
+                                         S->getFunction()))
       return std::nullopt;

     // Can't forward from non-atomic to atomic without violating memory model.
@@ -1357,7 +1358,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // If the types mismatch and we can't handle it, reject reuse of the load.
     // If the stored value is larger or equal to the loaded value, we can reuse
     // it.
-    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(), DL))
+    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(),
+                                         LD->getFunction()))
      return std::nullopt;

     // Can't forward from non-atomic to atomic without violating memory model.
```

llvm/lib/Transforms/Utils/VNCoercion.cpp

Lines changed: 61 additions & 17 deletions

```diff
@@ -13,32 +13,54 @@ static bool isFirstClassAggregateOrScalableType(Type *Ty) {
   return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty);
 }

+static std::optional<unsigned> getKnownVScale(Function *F) {
+  const auto &Attrs = F->getAttributes().getFnAttrs();
+  unsigned MinVScale = Attrs.getVScaleRangeMin();
+  if (Attrs.getVScaleRangeMax() == MinVScale)
+    return MinVScale;
+  return std::nullopt;
+}
+
 /// Return true if coerceAvailableValueToLoadType will succeed.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL) {
+                                     Function *F) {
   Type *StoredTy = StoredVal->getType();
-
   if (StoredTy == LoadTy)
     return true;

+  const DataLayout &DL = F->getDataLayout();
   if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
       DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
     return true;

-  // If the loaded/stored value is a first class array/struct, or scalable type,
-  // don't try to transform them. We need to be able to bitcast to integer.
-  if (isFirstClassAggregateOrScalableType(LoadTy) ||
-      isFirstClassAggregateOrScalableType(StoredTy))
-    return false;
-
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedValue();
+  // If the loaded/stored value is a first class array/struct, don't try to
+  // transform them. We need to be able to bitcast to integer. For scalable
+  // vectors forwarded to fixed-sized vectors with a compile-time known
+  // vscale, @llvm.vector.extract is used.
+  uint64_t StoreSize, LoadSize;
+  if (isa<ScalableVectorType>(StoredTy) && isa<FixedVectorType>(LoadTy)) {
+    std::optional<unsigned> VScale = getKnownVScale(F);
+    if (!VScale || StoredTy->getScalarType() != LoadTy->getScalarType())
+      return false;
+
+    StoreSize =
+        DL.getTypeSizeInBits(StoredTy).getKnownMinValue() * VScale.value();
+    LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedValue();
+  } else {
+    if (isFirstClassAggregateOrScalableType(LoadTy) ||
+        isFirstClassAggregateOrScalableType(StoredTy))
+      return false;
+
+    StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedValue();
+    LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedValue();
+  }

   // The store size must be byte-aligned to support future type casts.
   if (llvm::alignTo(StoreSize, 8) != StoreSize)
     return false;

   // The store has to be at least as big as the load.
-  if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (StoreSize < LoadSize)
     return false;

   bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
@@ -57,11 +79,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
     return false;
   }

-
   // The implementation below uses inttoptr for vectors of unequal size; we
   // can't allow this for non integral pointers. We could teach it to extract
   // exact subvectors if desired.
-  if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (StoredNI && StoreSize != LoadSize)
     return false;

   if (StoredTy->isTargetExtTy() || LoadTy->isTargetExtTy())
@@ -79,14 +100,23 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
                                       IRBuilderBase &Helper,
                                       const DataLayout &DL) {
-  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+  assert(canCoerceMustAliasedValueToLoad(
+             StoredVal, LoadedTy, Helper.GetInsertBlock()->getParent()) &&
          "precondition violation - materialization can't fail");
   if (auto *C = dyn_cast<Constant>(StoredVal))
     StoredVal = ConstantFoldConstant(C, DL);

   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();

+  // If this is a scalable vector forwarded to a fixed vector load, create
+  // a @llvm.vector.extract instead of bitcasts.
+  if (isa<ScalableVectorType>(StoredVal->getType()) &&
+      isa<FixedVectorType>(LoadedTy)) {
+    return Helper.CreateIntrinsic(LoadedTy, Intrinsic::vector_extract,
+                                  {StoredVal, Helper.getInt64(0)});
+  }
+
   TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
   TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);

@@ -220,7 +250,7 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
   if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
     return -1;

-  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DepSI->getFunction()))
     return -1;

   Value *StorePtr = DepSI->getPointerOperand();
@@ -235,11 +265,11 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
 /// the other load can feed into the second load.
 int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
                                   const DataLayout &DL) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+  // Cannot handle reading from store of first-class aggregate or scalable type.
+  if (isFirstClassAggregateOrScalableType(DepLI->getType()))
     return -1;

-  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DepLI->getFunction()))
     return -1;

   Value *DepPtr = DepLI->getPointerOperand();
@@ -315,6 +345,16 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }

+  // For the case of a scalable vector being forwarded to a fixed-sized load,
+  // only equal element types are allowed and a @llvm.vector.extract will be
+  // used instead of bitcasts.
+  if (isa<ScalableVectorType>(SrcVal->getType()) &&
+      isa<FixedVectorType>(LoadTy)) {
+    assert(Offset == 0 &&
+           SrcVal->getType()->getScalarType() == LoadTy->getScalarType());
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -348,6 +388,10 @@ Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
 #ifndef NDEBUG
   TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
   TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
+  if (SrcValSize.isScalable() && !LoadSize.isScalable())
+    SrcValSize =
+        TypeSize::getFixed(SrcValSize.getKnownMinValue() *
+                           getKnownVScale(InsertPt->getFunction()).value());
   assert(SrcValSize.isScalable() == LoadSize.isScalable());
   assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
          "Expected Offset + LoadSize <= SrcValSize");
```

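As a usage sketch of the new path (the function name and types below are illustrative, not from the patch): for a function carrying vscale_range(2,2), getKnownVScale returns 2, so a stored <vscale x 4 x i32> has a known size of 2 × 128 = 256 bits, and canCoerceMustAliasedValueToLoad accepts a same-element-type <8 x i32> (256-bit) load of those bytes. coerceAvailableValueToLoadType then materializes the forwarded value with @llvm.vector.extract at index 0.

```llvm
; Illustrative only: vscale_range(2,2) pins vscale, making the scalable store
; a known 256 bits, which covers the 256-bit fixed load below.
define <8 x i32> @forward(<vscale x 4 x i32> %v, ptr %p) vscale_range(2,2) {
  store <vscale x 4 x i32> %v, ptr %p
  %ld = load <8 x i32>, ptr %p
  ; After GVN, %ld is replaced by:
  ;   call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> %v, i64 0)
  ret <8 x i32> %ld
}
```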
llvm/test/Transforms/GVN/vscale.ll

Lines changed: 1 addition & 3 deletions

```diff
@@ -648,9 +648,7 @@ define <vscale x 4 x float> @scalable_store_to_fixed_load(<vscale x 4 x float> %
 ; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
 ; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
-; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
-; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 ;
 entry:
   %retval = alloca { <16 x float> }
```
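For reference, GVN lit tests such as vscale.ll are normally driven by a RUN line of roughly the following shape; the actual RUN line sits outside this hunk, so this is an assumption:

```llvm
; Presumed RUN line (not visible in the hunk above):
; RUN: opt < %s -passes=gvn -S | FileCheck %s
```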
