[GVN] Load-store forwarding of scalable store to fixed load. #124748

Merged · 2 commits · Jan 30, 2025
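What this change does, as a minimal before/after sketch (condensed from the scalable_store_to_small_fixed_load test added below; the function name here is just for illustration): GVN can now forward a value stored as a scalable vector to a load of a fixed-width vector with the same element type, as long as the store is known to be at least as large as the load. Instead of bitcasting, the forwarded value is produced with @llvm.vector.extract.

; Before GVN: the fixed-width load reads back the low part of the scalable store.
define <4 x float> @forward_example(<vscale x 4 x float> %a) {
entry:
  %ptr = alloca <vscale x 4 x float>
  store <vscale x 4 x float> %a, ptr %ptr
  %v = load <4 x float>, ptr %ptr
  ret <4 x float> %v
}

; After GVN (sketch), the load is gone and the value is extracted from %a:
;   %v = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %a, i64 0)
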
7 changes: 4 additions & 3 deletions llvm/include/llvm/Transforms/Utils/VNCoercion.h
@@ -23,6 +23,7 @@

namespace llvm {
class Constant;
class Function;
class StoreInst;
class LoadInst;
class MemIntrinsic;
@@ -35,7 +36,7 @@ namespace VNCoercion {
/// Return true if CoerceAvailableValueToLoadType would succeed if it was
/// called.
bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
const DataLayout &DL);
Function *F);

/// If we saw a store of a value to memory, and then a load from a must-aliased
/// pointer of a different type, try to coerce the stored value to the loaded
@@ -44,7 +45,7 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
///
/// If we can't do it, return null.
Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
IRBuilderBase &IRB, const DataLayout &DL);
IRBuilderBase &IRB, Function *F);

/// This function determines whether a value for the pointer LoadPtr can be
/// extracted from the store at DepSI.
@@ -75,7 +76,7 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
/// It inserts instructions to do so at InsertPt, and returns the extracted
/// value.
Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
Instruction *InsertPt, const DataLayout &DL);
Instruction *InsertPt, Function *F);
// This is the same as getValueForLoad, except it performs no insertion.
// It only allows constant inputs.
Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
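The signature changes above replace the const DataLayout & parameter with a Function *: the coercion helpers now need the function in order to read its vscale_range attribute, and they still reach the DataLayout through F->getDataLayout() (see the VNCoercion.cpp changes below).
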
13 changes: 8 additions & 5 deletions llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1096,7 +1096,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
if (isSimpleValue()) {
Res = getSimpleValue();
if (Res->getType() != LoadTy) {
Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, Load->getFunction());

LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
<< " " << *getSimpleValue() << '\n'
@@ -1109,7 +1109,8 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
Res = CoercedLoad;
combineMetadataForCSE(CoercedLoad, Load, false);
} else {
Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt, DL);
Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt,
Load->getFunction());
// We are adding a new user for this load, for which the original
// metadata may not hold. Additionally, the new load may have a different
// size and type, so their metadata cannot be combined in any
@@ -1291,7 +1292,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,

// If MD reported clobber, check it was nested.
if (DepInfo.isClobber() &&
canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) {
canCoerceMustAliasedValueToLoad(DepLoad, LoadType,
DepLoad->getFunction())) {
const auto ClobberOff = MD->getClobberOffset(DepLoad);
// GVN has no deal with a negative offset.
Offset = (ClobberOff == std::nullopt || *ClobberOff < 0)
@@ -1343,7 +1345,7 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
// different types if we have to. If the stored value is convertable to
// the loaded value, we can reuse it.
if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), Load->getType(),
DL))
S->getFunction()))
return std::nullopt;

// Can't forward from non-atomic to atomic without violating memory model.
@@ -1357,7 +1359,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
// If the types mismatch and we can't handle it, reject reuse of the load.
// If the stored value is larger or equal to the loaded value, we can reuse
// it.
if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(), DL))
if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(),
LD->getFunction()))
return std::nullopt;

// Can't forward from non-atomic to atomic without violating memory model.
92 changes: 63 additions & 29 deletions llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -15,30 +15,42 @@ static bool isFirstClassAggregateOrScalableType(Type *Ty) {

/// Return true if coerceAvailableValueToLoadType will succeed.
bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
const DataLayout &DL) {
Function *F) {
Type *StoredTy = StoredVal->getType();

if (StoredTy == LoadTy)
return true;

const DataLayout &DL = F->getDataLayout();
TypeSize MinStoreSize = DL.getTypeSizeInBits(StoredTy);
TypeSize LoadSize = DL.getTypeSizeInBits(LoadTy);
if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
MinStoreSize == LoadSize)
return true;

// If the loaded/stored value is a first class array/struct, or scalable type,
// don't try to transform them. We need to be able to bitcast to integer.
if (isFirstClassAggregateOrScalableType(LoadTy) ||
isFirstClassAggregateOrScalableType(StoredTy))
// If the loaded/stored value is a first class array/struct, don't try to
// transform them. We need to be able to bitcast to integer. For scalable
// vectors forwarded to fixed-sized vectors @llvm.vector.extract is used.
if (isa<ScalableVectorType>(StoredTy) && isa<FixedVectorType>(LoadTy)) {
if (StoredTy->getScalarType() != LoadTy->getScalarType())
return false;

// If it is known at compile-time that the VScale is larger than one,
// use that information to allow for wider loads.
const auto &Attrs = F->getAttributes().getFnAttrs();
unsigned MinVScale = Attrs.getVScaleRangeMin();
MinStoreSize =
TypeSize::getFixed(MinStoreSize.getKnownMinValue() * MinVScale);
} else if (isFirstClassAggregateOrScalableType(LoadTy) ||
isFirstClassAggregateOrScalableType(StoredTy)) {
return false;

uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedValue();
}

// The store size must be byte-aligned to support future type casts.
if (llvm::alignTo(StoreSize, 8) != StoreSize)
if (llvm::alignTo(MinStoreSize, 8) != MinStoreSize)
return false;

// The store has to be at least as big as the load.
if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedValue())
if (!TypeSize::isKnownGE(MinStoreSize, LoadSize))
return false;

bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
@@ -57,11 +69,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
return false;
}


// The implementation below uses inttoptr for vectors of unequal size; we
// can't allow this for non integral pointers. We could teach it to extract
// exact subvectors if desired.
if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedValue())
if (StoredNI && (StoredTy->isScalableTy() || MinStoreSize != LoadSize))
return false;

if (StoredTy->isTargetExtTy() || LoadTy->isTargetExtTy())
@@ -77,16 +88,24 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
///
/// If we can't do it, return null.
Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
IRBuilderBase &Helper,
const DataLayout &DL) {
assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
IRBuilderBase &Helper, Function *F) {
assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, F) &&
"precondition violation - materialization can't fail");
const DataLayout &DL = F->getDataLayout();
if (auto *C = dyn_cast<Constant>(StoredVal))
StoredVal = ConstantFoldConstant(C, DL);

// If this is already the right type, just return it.
Type *StoredValTy = StoredVal->getType();

// If this is a scalable vector forwarded to a fixed vector load, create
// a @llvm.vector.extract instead of bitcasts.
if (isa<ScalableVectorType>(StoredVal->getType()) &&
isa<FixedVectorType>(LoadedTy)) {
return Helper.CreateIntrinsic(LoadedTy, Intrinsic::vector_extract,
{StoredVal, Helper.getInt64(0)});
}

TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);

@@ -220,7 +239,7 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
return -1;

if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DepSI->getFunction()))
return -1;

Value *StorePtr = DepSI->getPointerOperand();
@@ -235,11 +254,11 @@ int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
/// the other load can feed into the second load.
int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
const DataLayout &DL) {
// Cannot handle reading from store of first-class aggregate yet.
if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
// Cannot handle reading from store of first-class aggregate or scalable type.
if (isFirstClassAggregateOrScalableType(DepLI->getType()))
return -1;

if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DepLI->getFunction()))
return -1;

Value *DepPtr = DepLI->getPointerOperand();
@@ -315,6 +334,16 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
return SrcVal;
}

// For the case of a scalable vector being forwarded to a fixed-sized load,
// only equal element types are allowed and a @llvm.vector.extract will be
// used instead of bitcasts.
if (isa<ScalableVectorType>(SrcVal->getType()) &&
isa<FixedVectorType>(LoadTy)) {
assert(Offset == 0 &&
SrcVal->getType()->getScalarType() == LoadTy->getScalarType());
return SrcVal;
}

uint64_t StoreSize =
(DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -344,20 +373,24 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
}

Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
Instruction *InsertPt, const DataLayout &DL) {
Instruction *InsertPt, Function *F) {
const DataLayout &DL = F->getDataLayout();
#ifndef NDEBUG
TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
TypeSize MinSrcValSize = DL.getTypeStoreSize(SrcVal->getType());
TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
assert(SrcValSize.isScalable() == LoadSize.isScalable());
assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
if (MinSrcValSize.isScalable() && !LoadSize.isScalable())
MinSrcValSize =
TypeSize::getFixed(MinSrcValSize.getKnownMinValue() *
F->getAttributes().getFnAttrs().getVScaleRangeMin());
assert((MinSrcValSize.isScalable() || Offset + LoadSize <= MinSrcValSize) &&
"Expected Offset + LoadSize <= SrcValSize");
assert(
(!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
"Expected scalable type sizes to match");
assert((!MinSrcValSize.isScalable() ||
(Offset == 0 && TypeSize::isKnownLE(LoadSize, MinSrcValSize))) &&
"Expected offset of zero and LoadSize <= SrcValSize");
#endif
IRBuilder<> Builder(InsertPt);
SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, F);
}

Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
@@ -408,7 +441,8 @@ Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
++NumBytesSet;
}

return coerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
return coerceAvailableValueToLoadType(Val, LoadTy, Builder,
InsertPt->getFunction());
}

// Otherwise, this is a memcpy/memmove from a constant global.
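A worked example of the new size check in canCoerceMustAliasedValueToLoad (the numbers follow from the tests below): with vscale_range(4,4), a stored <vscale x 4 x float> is known to occupy at least 4 x 4 x 32 = 512 bits, so a fixed <16 x float> load (512 bits) passes the MinStoreSize >= LoadSize check and can be forwarded. Without a vscale_range attribute the known minimum is only 128 bits, so a <16 x float> load is rejected (scalable_store_to_fixed_load_unknown_vscale) while a <4 x float> load still qualifies (scalable_store_to_small_fixed_load).
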
124 changes: 124 additions & 0 deletions llvm/test/Transforms/GVN/vscale.ll
Contributor: It would be good to add a negative test where the load has an extra constant offset, to make sure we don't forward in that case.

Collaborator: Different types might be useful too, for example a scalable_store_to_fixed_load with <vscale x 4 x float> input but <vscale x 4 x i32> output.

Contributor Author: I added one just now (in the existing test file).

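For reference, a condensed sketch of the negative case requested in the first comment (the PR's scalable_store_to_fixed_load_with_offset test below is the real version): because the load starts at a non-zero constant offset into the scalable store, GVN must not forward the value.

define <16 x float> @no_forward_with_offset(<vscale x 4 x float> %a) vscale_range(4,4) {
entry:
  %ptr = alloca { <32 x float> }
  store <vscale x 4 x float> %a, ptr %ptr
  %gep = getelementptr inbounds i8, ptr %ptr, i64 8
  %v = load <16 x float>, ptr %gep      ; must stay; not replaced by @llvm.vector.extract
  ret <16 x float> %v
}
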
@@ -641,3 +641,127 @@ entry:
call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp)
ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %15
}

define <vscale x 4 x float> @scalable_store_to_fixed_load(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
; CHECK-LABEL: @scalable_store_to_fixed_load(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
;
entry:
%retval = alloca { <16 x float> }
%0 = fadd <vscale x 4 x float> %.coerce, %.coerce
store <vscale x 4 x float> %0, ptr %retval
%1 = load <16 x float>, ptr %retval
%cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
ret <vscale x 4 x float> %cast.scalable
}

; Here, only the lower bound for the vscale is known, but this is enough to allow forwarding to a load of 16 elements.
define <vscale x 4 x float> @scalable_store_to_fixed_load_only_lower_bound(<vscale x 4 x float> %a) vscale_range(4) {
; CHECK-LABEL: @scalable_store_to_fixed_load_only_lower_bound(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 4 x float> }, align 16
; CHECK-NEXT: store <vscale x 4 x float> [[A:%.*]], ptr [[RETVAL]], align 16
; CHECK-NEXT: ret <vscale x 4 x float> [[A]]
;
entry:
%retval = alloca { <vscale x 4 x float> }
store <vscale x 4 x float> %a, ptr %retval
%1 = load <16 x float>, ptr %retval
%cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x float> @scalable_store_to_fixed_load_with_offset(<vscale x 4 x float> %a) vscale_range(4,4) {
; CHECK-LABEL: @scalable_store_to_fixed_load_with_offset(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[PTR:%.*]] = alloca { <32 x float> }, align 128
; CHECK-NEXT: store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[GEP]], align 64
; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
;
entry:
%ptr = alloca { <32 x float> }
store <vscale x 4 x float> %a, ptr %ptr
%gep = getelementptr inbounds i8, ptr %ptr, i64 8
%1 = load <16 x float>, ptr %gep
%cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x float> @scalable_store_to_fixed_load_unknown_vscale(<vscale x 4 x float> %.coerce) {
; CHECK-LABEL: @scalable_store_to_fixed_load_unknown_vscale(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
;
entry:
%retval = alloca { <16 x float> }
%0 = fadd <vscale x 4 x float> %.coerce, %.coerce
store <vscale x 4 x float> %0, ptr %retval
%1 = load <16 x float>, ptr %retval
%cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x float> @scalable_store_to_fixed_load_size_missmatch(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
; CHECK-LABEL: @scalable_store_to_fixed_load_size_missmatch(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RETVAL:%.*]] = alloca { <32 x float> }, align 128
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, ptr [[RETVAL]], align 128
; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> [[TMP1]], i64 0)
; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
;
entry:
%retval = alloca { <32 x float> }
%0 = fadd <vscale x 4 x float> %.coerce, %.coerce
store <vscale x 4 x float> %0, ptr %retval
%1 = load <32 x float>, ptr %retval
%cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> %1, i64 0)
ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x i32> @scalable_store_to_fixed_load_different_types(<vscale x 4 x float> %a) vscale_range(4,4) {
; CHECK-LABEL: @scalable_store_to_fixed_load_different_types(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[PTR:%.*]] = alloca { <16 x float> }, align 64
; CHECK-NEXT: store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[PTR]], align 64
; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TMP0]], i64 0)
; CHECK-NEXT: ret <vscale x 4 x i32> [[CAST_SCALABLE]]
;
entry:
%ptr = alloca { <16 x float> }
store <vscale x 4 x float> %a, ptr %ptr
%1 = load <16 x i32>, ptr %ptr
%cast.scalable = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> %1, i64 0)
ret <vscale x 4 x i32> %cast.scalable
}

; This function does not have a fixed vscale, but the loaded vector is still known
; to be smaller or equal in size compared to the stored vector.
define <4 x float> @scalable_store_to_small_fixed_load(<vscale x 4 x float> %a) {
; CHECK-LABEL: @scalable_store_to_small_fixed_load(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[PTR:%.*]] = alloca <vscale x 4 x float>, align 16
; CHECK-NEXT: store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> [[A]], i64 0)
; CHECK-NEXT: ret <4 x float> [[TMP0]]
;
entry:
%ptr = alloca <vscale x 4 x float>
store <vscale x 4 x float> %a, ptr %ptr
%1 = load <4 x float>, ptr %ptr
ret <4 x float> %1
}