Commit 7dfc50a

[GVN] Load-store forwarding of scalable store to fixed load.

When storing a scalable vector and the vscale is a compile-time known constant, perform store-to-load forwarding through temporary @llvm.vector.extract calls, even if the loaded vector is fixed-sized instead of scalable. InstCombine then folds the insert/extract pair away.

The use case is shown in this [godbolt link](https://godbolt.org/z/KT3sMrMbd), which shows that clang generates IR that matches this pattern when the "arm_sve_vector_bits" attribute is used:

```c
typedef svfloat32_t svfloat32_fixed_t
    __attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t add(svfloat32_wrapped_t a,
                                      svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```
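To make the pattern concrete, here is a hand-written IR sketch in the spirit of the godbolt link (mine, not taken from the patch or its tests; the function name `@reduced` is made up), roughly the shape clang produces when vscale is pinned to 4, i.e. 512-bit SVE registers:

```llvm
; With vscale known to be 4, the <vscale x 4 x float> store covers 512 bits,
; so the 512-bit fixed-width load below can be forwarded from it via
; @llvm.vector.extract.
define <vscale x 4 x float> @reduced(<vscale x 4 x float> %a, <vscale x 4 x float> %b) vscale_range(4,4) {
entry:
  %retval = alloca { <16 x float> }
  %add = fadd <vscale x 4 x float> %a, %b
  store <vscale x 4 x float> %add, ptr %retval
  %v = load <16 x float>, ptr %retval
  %cast.scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %v, i64 0)
  ret <vscale x 4 x float> %cast.scalable
}

declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64)
```

With this patch, GVN replaces the fixed-width load with an `@llvm.vector.extract` of `%add`; InstCombine then folds that extract into the following `@llvm.vector.insert`, leaving just `ret <vscale x 4 x float> %add`, as the updated test below checks.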
1 parent 3ce8db3 commit 7dfc50a

File tree

4 files changed (+82, -38 lines)

llvm/include/llvm/Transforms/Utils/VNCoercion.h

Lines changed: 4 additions & 3 deletions
```diff
@@ -23,6 +23,7 @@
 
 namespace llvm {
 class Constant;
+class Function;
 class StoreInst;
 class LoadInst;
 class MemIntrinsic;
@@ -35,7 +36,7 @@ namespace VNCoercion {
 /// Return true if CoerceAvailableValueToLoadType would succeed if it was
 /// called.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL);
+                                     Function *F);
 
 /// If we saw a store of a value to memory, and then a load from a must-aliased
 /// pointer of a different type, try to coerce the stored value to the loaded
@@ -44,7 +45,7 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &IRB, const DataLayout &DL);
+                                      IRBuilderBase &IRB, Function *F);
 
 /// This function determines whether a value for the pointer LoadPtr can be
 /// extracted from the store at DepSI.
@@ -75,7 +76,7 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
 /// It inserts instructions to do so at InsertPt, and returns the extracted
 /// value.
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                       Instruction *InsertPt, const DataLayout &DL);
+                       Instruction *InsertPt, Function *F);
 // This is the same as getValueForLoad, except it performs no insertion.
 // It only allows constant inputs.
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
```

llvm/lib/Transforms/Scalar/GVN.cpp

Lines changed: 8 additions & 5 deletions
```diff
@@ -1096,7 +1096,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
   if (isSimpleValue()) {
     Res = getSimpleValue();
     if (Res->getType() != LoadTy) {
-      Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+      Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, Load->getFunction());
 
       LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
                         << " " << *getSimpleValue() << '\n'
@@ -1109,7 +1109,8 @@
       Res = CoercedLoad;
       combineMetadataForCSE(CoercedLoad, Load, false);
     } else {
-      Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt, DL);
+      Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt,
+                            Load->getFunction());
       // We are adding a new user for this load, for which the original
       // metadata may not hold. Additionally, the new load may have a different
       // size and type, so their metadata cannot be combined in any
@@ -1291,7 +1292,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
 
       // If MD reported clobber, check it was nested.
       if (DepInfo.isClobber() &&
-          canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) {
+          canCoerceMustAliasedValueToLoad(DepLoad, LoadType,
+                                          DepLoad->getFunction())) {
         const auto ClobberOff = MD->getClobberOffset(DepLoad);
         // GVN has no deal with a negative offset.
         Offset = (ClobberOff == std::nullopt || *ClobberOff < 0)
@@ -1343,7 +1345,7 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // different types if we have to. If the stored value is convertable to
     // the loaded value, we can reuse it.
     if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), Load->getType(),
-                                         DL))
+                                         S->getFunction()))
       return std::nullopt;
 
     // Can't forward from non-atomic to atomic without violating memory model.
@@ -1357,7 +1359,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // If the types mismatch and we can't handle it, reject reuse of the load.
     // If the stored value is larger or equal to the loaded value, we can reuse
     // it.
-    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(), DL))
+    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(),
+                                         LD->getFunction()))
       return std::nullopt;
 
     // Can't forward from non-atomic to atomic without violating memory model.
```

llvm/lib/Transforms/Utils/VNCoercion.cpp

Lines changed: 68 additions & 26 deletions
```diff
@@ -13,32 +13,52 @@ static bool isFirstClassAggregateOrScalableType(Type *Ty) {
   return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty);
 }
 
+static std::optional<unsigned> getKnownVScale(Function *F) {
+  const auto &Attrs = F->getAttributes().getFnAttrs();
+  unsigned MinVScale = Attrs.getVScaleRangeMin();
+  if (Attrs.getVScaleRangeMax() == MinVScale)
+    return MinVScale;
+  return std::nullopt;
+}
+
 /// Return true if coerceAvailableValueToLoadType will succeed.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL) {
+                                     Function *F) {
   Type *StoredTy = StoredVal->getType();
-
   if (StoredTy == LoadTy)
     return true;
 
+  const DataLayout &DL = F->getDataLayout();
+  TypeSize StoreSize = DL.getTypeSizeInBits(StoredTy);
+  TypeSize LoadSize = DL.getTypeSizeInBits(LoadTy);
   if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
-      DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
+      StoreSize == LoadSize)
     return true;
 
-  // If the loaded/stored value is a first class array/struct, or scalable type,
-  // don't try to transform them. We need to be able to bitcast to integer.
-  if (isFirstClassAggregateOrScalableType(LoadTy) ||
-      isFirstClassAggregateOrScalableType(StoredTy))
+  // If the loaded/stored value is a first class array/struct, don't try to
+  // transform them. We need to be able to bitcast to integer. For scalable
+  // vectors forwarded to fixed-sized vectors @llvm.vector.extract is used.
+  if (isa<ScalableVectorType>(StoredTy) && isa<FixedVectorType>(LoadTy)) {
+    if (StoredTy->getScalarType() != LoadTy->getScalarType())
+      return false;
+
+    // If the VScale is known at compile-time, use that information to
+    // allow for wider loads.
+    std::optional<unsigned> VScale = getKnownVScale(F);
+    if (VScale)
+      StoreSize =
+          TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale.value());
+  } else if (isFirstClassAggregateOrScalableType(LoadTy) ||
+             isFirstClassAggregateOrScalableType(StoredTy)) {
     return false;
-
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedValue();
+  }
 
   // The store size must be byte-aligned to support future type casts.
   if (llvm::alignTo(StoreSize, 8) != StoreSize)
     return false;
 
   // The store has to be at least as big as the load.
-  if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (!TypeSize::isKnownGE(StoreSize, LoadSize))
     return false;
 
   bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
```
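As a worked illustration of the relaxed size check (my own example, not part of the patch): a `<vscale x 4 x float>` store has a known minimum size of 128 bits; with a `vscale_range(4,4)` attribute, `getKnownVScale` returns 4, so `StoreSize` becomes a fixed 4 × 128 = 512 bits and a 512-bit `<16 x float>` load passes the `TypeSize::isKnownGE` check. A minimal IR sketch, with a hypothetical function name:

```llvm
; Hypothetical example: the function-level vscale_range(4,4) attribute pins
; vscale to 4, so the scalable store is known to cover 512 bits and should be
; eligible for forwarding to the 512-bit fixed-width load.
define <16 x float> @wide_fixed_load(<vscale x 4 x float> %a) vscale_range(4,4) {
  %ptr = alloca <vscale x 4 x float>
  store <vscale x 4 x float> %a, ptr %ptr
  %v = load <16 x float>, ptr %ptr
  ret <16 x float> %v
}
```

Without the attribute, only the 128-bit known minimum can be assumed, so a wider fixed load is rejected by the same check.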
```diff
@@ -57,11 +77,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
     return false;
   }
 
-
   // The implementation below uses inttoptr for vectors of unequal size; we
   // can't allow this for non integral pointers. We could teach it to extract
   // exact subvectors if desired.
-  if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (StoredNI && StoreSize != LoadSize)
     return false;
 
   if (StoredTy->isTargetExtTy() || LoadTy->isTargetExtTy())
@@ -77,16 +96,24 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &Helper,
-                                      const DataLayout &DL) {
-  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+                                      IRBuilderBase &Helper, Function *F) {
+  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, F) &&
          "precondition violation - materialization can't fail");
+  const DataLayout &DL = F->getDataLayout();
   if (auto *C = dyn_cast<Constant>(StoredVal))
     StoredVal = ConstantFoldConstant(C, DL);
 
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
+  // If this is a scalable vector forwarded to a fixed vector load, create
+  // a @llvm.vector.extract instead of bitcasts.
+  if (isa<ScalableVectorType>(StoredVal->getType()) &&
+      isa<FixedVectorType>(LoadedTy)) {
+    return Helper.CreateIntrinsic(LoadedTy, Intrinsic::vector_extract,
+                                  {StoredVal, Helper.getInt64(0)});
+  }
+
   TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
   TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
 
```
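For the narrower-load case no known vscale is needed, since a 128-bit fixed load is always covered by the 128-bit known minimum of the scalable store; the coercion step then materializes the value with `@llvm.vector.extract` at index 0 rather than a bitcast chain. A hand-written sketch of the resulting IR (it mirrors the `scalable_store_to_small_fixed_load` test below; the function name is made up):

```llvm
; The stored scalable value is reused directly; the fixed-width result is the
; lowest 4 lanes, extracted at index 0.
define <4 x float> @forward_low_lanes(<vscale x 4 x float> %a) {
  %low = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %a, i64 0)
  ret <4 x float> %low
}

declare <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float>, i64)
```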
```diff
@@ -220,7 +247,7 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
   if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DepSI->getFunction()))
     return -1;
 
   Value *StorePtr = DepSI->getPointerOperand();
@@ -235,11 +262,11 @@
 /// the other load can feed into the second load.
 int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
                                   const DataLayout &DL) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+  // Cannot handle reading from store of first-class aggregate or scalable type.
+  if (isFirstClassAggregateOrScalableType(DepLI->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DepLI->getFunction()))
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
@@ -315,6 +342,16 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }
 
+  // For the case of a scalable vector beeing forwarded to a fixed-sized load,
+  // only equal element types are allowed and a @llvm.vector.extract will be
+  // used instead of bitcasts.
+  if (isa<ScalableVectorType>(SrcVal->getType()) &&
+      isa<FixedVectorType>(LoadTy)) {
+    assert(Offset == 0 &&
+           SrcVal->getType()->getScalarType() == LoadTy->getScalarType());
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -344,20 +381,24 @@
 }
 
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                       Instruction *InsertPt, const DataLayout &DL) {
+                       Instruction *InsertPt, Function *F) {
+  const DataLayout &DL = F->getDataLayout();
 #ifndef NDEBUG
   TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
   TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
-  assert(SrcValSize.isScalable() == LoadSize.isScalable());
+  if (auto VScale = getKnownVScale(InsertPt->getFunction());
+      VScale && SrcValSize.isScalable() && !LoadSize.isScalable())
+    SrcValSize =
+        TypeSize::getFixed(SrcValSize.getKnownMinValue() * VScale.value());
   assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
          "Expected Offset + LoadSize <= SrcValSize");
-  assert(
-      (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
-      "Expected scalable type sizes to match");
+  assert((!SrcValSize.isScalable() ||
+          (Offset == 0 && TypeSize::isKnownLE(LoadSize, SrcValSize))) &&
+         "Expected offset of zero and LoadSize <= SrcValSize");
 #endif
   IRBuilder<> Builder(InsertPt);
   SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
-  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
+  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, F);
 }
 
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
@@ -408,7 +449,8 @@ Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
       ++NumBytesSet;
     }
 
-    return coerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
+    return coerceAvailableValueToLoadType(Val, LoadTy, Builder,
+                                          InsertPt->getFunction());
  }
 
   // Otherwise, this is a memcpy/memmove from a constant global.
```

llvm/test/Transforms/GVN/vscale.ll

Lines changed: 2 additions & 4 deletions
```diff
@@ -648,9 +648,7 @@ define <vscale x 4 x float> @scalable_store_to_fixed_load(<vscale x 4 x float> %
 ; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
 ; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
-; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
-; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 ;
 entry:
   %retval = alloca { <16 x float> }
@@ -725,7 +723,7 @@ define <4 x float> @scalable_store_to_small_fixed_load(<vscale x 4 x float> %a)
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[PTR:%.*]] = alloca <vscale x 4 x float>, align 16
 ; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> [[A]], i64 0)
 ; CHECK-NEXT:    ret <4 x float> [[TMP0]]
 ;
 entry:
```
