Commit c75b251

[GVN] Load-store forwarding of scalable store to fixed load. (#124748)

When storing a scalable vector and loading a fixed-size vector, where the scalable vector is known to be larger based on vscale_range, perform store-to-load forwarding through temporary @llvm.vector.extract calls. InstCombine then folds the insert/extract pair away.

The use case is shown in https://godbolt.org/z/KT3sMrMbd: clang generates IR that matches this pattern when the "arm_sve_vector_bits" attribute is used:

```c
typedef svfloat32_t svfloat32_fixed_t
    __attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t add(svfloat32_wrapped_t a,
                                      svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```
1 parent 3262863 commit c75b251
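For reference, a minimal before/after sketch of the IR pattern this enables (assuming `vscale_range(4,4)`, i.e. 512-bit vectors, mirroring the new `@scalable_store_to_fixed_load` test added below; the function name `@example` is illustrative):

```llvm
define <vscale x 4 x float> @example(<vscale x 4 x float> %a) vscale_range(4,4) {
entry:
  %buf = alloca { <16 x float> }
  ; With vscale_range(4,4), <vscale x 4 x float> has exactly 16 elements, so the
  ; scalable store below fully covers the fixed-width load that follows.
  store <vscale x 4 x float> %a, ptr %buf
  %fixed = load <16 x float>, ptr %buf
  %res = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %fixed, i64 0)
  ret <vscale x 4 x float> %res
}
```

With this patch, GVN forwards the stored value to the load through a temporary `@llvm.vector.extract.v16f32.nxv4f32(<vscale x 4 x float> %a, i64 0)` call; the resulting extract/insert pair is then folded away, leaving `ret <vscale x 4 x float> %a`.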

5 files changed (+327, -37 lines)

llvm/include/llvm/Transforms/Utils/VNCoercion.h

Lines changed: 4 additions & 3 deletions
```diff
@@ -23,6 +23,7 @@
 
 namespace llvm {
 class Constant;
+class Function;
 class StoreInst;
 class LoadInst;
 class MemIntrinsic;
@@ -35,7 +36,7 @@ namespace VNCoercion {
 /// Return true if CoerceAvailableValueToLoadType would succeed if it was
 /// called.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL);
+                                     Function *F);
 
 /// If we saw a store of a value to memory, and then a load from a must-aliased
 /// pointer of a different type, try to coerce the stored value to the loaded
@@ -44,7 +45,7 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &IRB, const DataLayout &DL);
+                                      IRBuilderBase &IRB, Function *F);
 
 /// This function determines whether a value for the pointer LoadPtr can be
 /// extracted from the store at DepSI.
@@ -75,7 +76,7 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
 /// It inserts instructions to do so at InsertPt, and returns the extracted
 /// value.
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                       Instruction *InsertPt, const DataLayout &DL);
+                       Instruction *InsertPt, Function *F);
 // This is the same as getValueForLoad, except it performs no insertion.
 // It only allows constant inputs.
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
```

llvm/lib/Transforms/Scalar/GVN.cpp

Lines changed: 8 additions & 5 deletions
```diff
@@ -1096,7 +1096,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
   if (isSimpleValue()) {
     Res = getSimpleValue();
     if (Res->getType() != LoadTy) {
-      Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+      Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, Load->getFunction());
 
       LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
                         << " " << *getSimpleValue() << '\n'
@@ -1109,7 +1109,8 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
       Res = CoercedLoad;
       combineMetadataForCSE(CoercedLoad, Load, false);
     } else {
-      Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt, DL);
+      Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt,
+                            Load->getFunction());
       // We are adding a new user for this load, for which the original
       // metadata may not hold. Additionally, the new load may have a different
       // size and type, so their metadata cannot be combined in any
@@ -1291,7 +1292,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
 
       // If MD reported clobber, check it was nested.
       if (DepInfo.isClobber() &&
-          canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) {
+          canCoerceMustAliasedValueToLoad(DepLoad, LoadType,
+                                          DepLoad->getFunction())) {
         const auto ClobberOff = MD->getClobberOffset(DepLoad);
         // GVN has no deal with a negative offset.
         Offset = (ClobberOff == std::nullopt || *ClobberOff < 0)
@@ -1343,7 +1345,7 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // different types if we have to. If the stored value is convertable to
     // the loaded value, we can reuse it.
     if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), Load->getType(),
-                                         DL))
+                                         S->getFunction()))
       return std::nullopt;
 
     // Can't forward from non-atomic to atomic without violating memory model.
@@ -1357,7 +1359,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // If the types mismatch and we can't handle it, reject reuse of the load.
     // If the stored value is larger or equal to the loaded value, we can reuse
     // it.
-    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(), DL))
+    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(),
+                                         LD->getFunction()))
       return std::nullopt;
 
     // Can't forward from non-atomic to atomic without violating memory model.
```

llvm/lib/Transforms/Utils/VNCoercion.cpp

Lines changed: 63 additions & 29 deletions
```diff
@@ -15,30 +15,42 @@ static bool isFirstClassAggregateOrScalableType(Type *Ty) {
 
 /// Return true if coerceAvailableValueToLoadType will succeed.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL) {
+                                     Function *F) {
   Type *StoredTy = StoredVal->getType();
-
   if (StoredTy == LoadTy)
     return true;
 
+  const DataLayout &DL = F->getDataLayout();
+  TypeSize MinStoreSize = DL.getTypeSizeInBits(StoredTy);
+  TypeSize LoadSize = DL.getTypeSizeInBits(LoadTy);
   if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
-      DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
+      MinStoreSize == LoadSize)
     return true;
 
-  // If the loaded/stored value is a first class array/struct, or scalable type,
-  // don't try to transform them. We need to be able to bitcast to integer.
-  if (isFirstClassAggregateOrScalableType(LoadTy) ||
-      isFirstClassAggregateOrScalableType(StoredTy))
+  // If the loaded/stored value is a first class array/struct, don't try to
+  // transform them. We need to be able to bitcast to integer. For scalable
+  // vectors forwarded to fixed-sized vectors @llvm.vector.extract is used.
+  if (isa<ScalableVectorType>(StoredTy) && isa<FixedVectorType>(LoadTy)) {
+    if (StoredTy->getScalarType() != LoadTy->getScalarType())
+      return false;
+
+    // If it is known at compile-time that the VScale is larger than one,
+    // use that information to allow for wider loads.
+    const auto &Attrs = F->getAttributes().getFnAttrs();
+    unsigned MinVScale = Attrs.getVScaleRangeMin();
+    MinStoreSize =
+        TypeSize::getFixed(MinStoreSize.getKnownMinValue() * MinVScale);
+  } else if (isFirstClassAggregateOrScalableType(LoadTy) ||
+             isFirstClassAggregateOrScalableType(StoredTy)) {
     return false;
-
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedValue();
+  }
 
   // The store size must be byte-aligned to support future type casts.
-  if (llvm::alignTo(StoreSize, 8) != StoreSize)
+  if (llvm::alignTo(MinStoreSize, 8) != MinStoreSize)
     return false;
 
   // The store has to be at least as big as the load.
-  if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (!TypeSize::isKnownGE(MinStoreSize, LoadSize))
     return false;
 
   bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
@@ -57,11 +69,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
       return false;
   }
 
-
   // The implementation below uses inttoptr for vectors of unequal size; we
   // can't allow this for non integral pointers. We could teach it to extract
   // exact subvectors if desired.
-  if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (StoredNI && (StoredTy->isScalableTy() || MinStoreSize != LoadSize))
     return false;
 
   if (StoredTy->isTargetExtTy() || LoadTy->isTargetExtTy())
@@ -77,16 +88,24 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &Helper,
-                                      const DataLayout &DL) {
-  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+                                      IRBuilderBase &Helper, Function *F) {
+  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, F) &&
         "precondition violation - materialization can't fail");
+  const DataLayout &DL = F->getDataLayout();
   if (auto *C = dyn_cast<Constant>(StoredVal))
     StoredVal = ConstantFoldConstant(C, DL);
 
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
+  // If this is a scalable vector forwarded to a fixed vector load, create
+  // a @llvm.vector.extract instead of bitcasts.
+  if (isa<ScalableVectorType>(StoredVal->getType()) &&
+      isa<FixedVectorType>(LoadedTy)) {
+    return Helper.CreateIntrinsic(LoadedTy, Intrinsic::vector_extract,
+                                  {StoredVal, Helper.getInt64(0)});
+  }
+
   TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
   TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
 
@@ -220,7 +239,7 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
   if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DepSI->getFunction()))
     return -1;
 
   Value *StorePtr = DepSI->getPointerOperand();
@@ -235,11 +254,11 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
 /// the other load can feed into the second load.
 int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
                                   const DataLayout &DL) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+  // Cannot handle reading from store of first-class aggregate or scalable type.
+  if (isFirstClassAggregateOrScalableType(DepLI->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DepLI->getFunction()))
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
@@ -315,6 +334,16 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }
 
+  // For the case of a scalable vector being forwarded to a fixed-sized load,
+  // only equal element types are allowed and a @llvm.vector.extract will be
+  // used instead of bitcasts.
+  if (isa<ScalableVectorType>(SrcVal->getType()) &&
+      isa<FixedVectorType>(LoadTy)) {
+    assert(Offset == 0 &&
+           SrcVal->getType()->getScalarType() == LoadTy->getScalarType());
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -344,20 +373,24 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
 }
 
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                       Instruction *InsertPt, const DataLayout &DL) {
+                       Instruction *InsertPt, Function *F) {
+  const DataLayout &DL = F->getDataLayout();
 #ifndef NDEBUG
-  TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  TypeSize MinSrcValSize = DL.getTypeStoreSize(SrcVal->getType());
   TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
-  assert(SrcValSize.isScalable() == LoadSize.isScalable());
-  assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
+  if (MinSrcValSize.isScalable() && !LoadSize.isScalable())
+    MinSrcValSize =
+        TypeSize::getFixed(MinSrcValSize.getKnownMinValue() *
+                           F->getAttributes().getFnAttrs().getVScaleRangeMin());
+  assert((MinSrcValSize.isScalable() || Offset + LoadSize <= MinSrcValSize) &&
         "Expected Offset + LoadSize <= SrcValSize");
-  assert(
-      (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
-      "Expected scalable type sizes to match");
+  assert((!MinSrcValSize.isScalable() ||
+          (Offset == 0 && TypeSize::isKnownLE(LoadSize, MinSrcValSize))) &&
+         "Expected offset of zero and LoadSize <= SrcValSize");
 #endif
   IRBuilder<> Builder(InsertPt);
   SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
-  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
+  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, F);
 }
 
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
@@ -408,7 +441,8 @@ Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
       ++NumBytesSet;
     }
 
-    return coerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
+    return coerceAvailableValueToLoadType(Val, LoadTy, Builder,
+                                          InsertPt->getFunction());
  }
 
   // Otherwise, this is a memcpy/memmove from a constant global.
```
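To illustrate the new coercion path in `coerceAvailableValueToLoadType` above: when a stored `<vscale x 4 x float>` is forwarded to a `<16 x float>` load in a function with `vscale_range(4,4)`, the value is now materialized with a subvector extract rather than integer bitcasts. A sketch of the emitted IR (the value names are illustrative):

```llvm
; %stored is the available <vscale x 4 x float>; vscale >= 4 guarantees it
; covers all 16 loaded lanes, so lanes 0..15 can be extracted directly.
%forwarded = call <16 x float> @llvm.vector.extract.v16f32.nxv4f32(<vscale x 4 x float> %stored, i64 0)
```

This is also why `canCoerceMustAliasedValueToLoad` requires matching element types for the scalable-to-fixed case: `@llvm.vector.extract` cannot change the element type, so forwarding a stored `<vscale x 4 x float>` to, e.g., a `<16 x i32>` load is still rejected (see the `scalable_store_to_fixed_load_different_types` test below).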

llvm/test/Transforms/GVN/vscale.ll

Lines changed: 124 additions & 0 deletions
```diff
@@ -641,3 +641,127 @@ entry:
   call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp)
   ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %15
 }
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+; Here, only the lower bound for the vscale is known, but this is enough to allow a forward to a load to 16 elements.
+define <vscale x 4 x float> @scalable_store_to_fixed_load_only_lower_bound(<vscale x 4 x float> %a) vscale_range(4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_only_lower_bound(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x float> }, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+entry:
+  %retval = alloca { <vscale x 4 x float> }
+  store <vscale x 4 x float> %a, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_with_offset(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_with_offset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[GEP]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <32 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %gep = getelementptr inbounds i8, ptr %ptr, i64 8
+  %1 = load <16 x float>, ptr %gep
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_unknown_vscale(<vscale x 4 x float> %.coerce) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_unknown_vscale(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_size_missmatch(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_size_missmatch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[RETVAL]], align 128
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <32 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <32 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x i32> @scalable_store_to_fixed_load_different_types(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_different_types(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[PTR]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <16 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <16 x i32>, ptr %ptr
+  %cast.scalable = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %cast.scalable
+}
+
+; This function does not have a fixed vscale, but the loaded vector is still known
+; to be smaller or equal in size compared to the stored vector.
+define <4 x float> @scalable_store_to_small_fixed_load(<vscale x 4 x float> %a) {
+; CHECK-LABEL: @scalable_store_to_small_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca <vscale x 4 x float>, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> [[A]], i64 0)
+; CHECK-NEXT:    ret <4 x float> [[TMP0]]
+;
+entry:
+  %ptr = alloca <vscale x 4 x float>
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <4 x float>, ptr %ptr
+  ret <4 x float> %1
+}
```
