[InstCombine] Simple store-to-load forwarding between fixed/scalable vectors #124577

Closed
4 changes: 4 additions & 0 deletions llvm/include/llvm/Analysis/Loads.h
@@ -154,8 +154,12 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
/// FindAvailableLoadedValue() for the case where we are not interested in
/// finding the closest clobbering instruction if no available load is found.
/// This overload cannot be used to scan across multiple blocks.
/// If \p IsVectorKindChange is not nullptr, it is an out parameter that is
/// set to true if a value was found but it is a scalable vector instead of
/// the requested fixed-size one (or the other way round).
Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
bool *IsLoadCSE,
bool *IsVectorKindChange = nullptr,
unsigned MaxInstsToScan = DefMaxInstsToScan);

/// Scan backwards to see if we have the value of the given pointer available
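For reference, here is a minimal sketch of how a caller can use the new out parameter. The helper name forwardStoredValue and its surrounding signature are illustrative, not part of this patch; only the FindAvailableLoadedValue call matches the declaration above.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical helper: try to forward a stored value to Load, reporting via
// VectorKindChanged when the available value is a scalable vector while the
// load expects a fixed-size one (or vice versa).
static Value *forwardStoredValue(LoadInst *Load, AAResults &AA,
                                 bool &VectorKindChanged) {
  BatchAAResults BatchAA(AA);
  bool IsVectorKindChange = false;
  Value *V = FindAvailableLoadedValue(Load, BatchAA, /*IsLoadCSE=*/nullptr,
                                      &IsVectorKindChange);
  VectorKindChanged = IsVectorKindChange;
  // When VectorKindChanged is true, V's type differs from Load's type, so the
  // caller must bridge the two vector kinds itself (as the InstCombine change
  // below does for llvm.vector.insert).
  return V;
}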
33 changes: 27 additions & 6 deletions llvm/lib/Analysis/Loads.cpp
@@ -538,7 +538,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,

static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
Type *AccessTy, bool AtLeastAtomic,
const DataLayout &DL, bool *IsLoadCSE) {
const DataLayout &DL, bool *IsLoadCSE,
bool *IsVectorKindChange) {
// If this is a load of Ptr, the loaded value is available.
// (This is true even if the load is volatile or atomic, although
// those cases are unlikely.)
@@ -584,6 +585,25 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
if (TypeSize::isKnownLE(LoadSize, StoreSize))
if (auto *C = dyn_cast<Constant>(Val))
return ConstantFoldLoadFromConst(C, AccessTy, DL);

if (IsVectorKindChange && Val->getType()->isVectorTy() &&
AccessTy->isVectorTy()) {
// Forwarding between a fixed-size and a scalable vector is only possible
// when vscale is a known constant, i.e. the function carries
// vscale_range(N, N); otherwise the two sizes cannot be compared exactly.
auto Attrs = Inst->getFunction()->getAttributes().getFnAttrs();
unsigned VScale = Attrs.getVScaleRangeMin();
if (Attrs.getVScaleRangeMax() != VScale)
return nullptr;

// Scale any scalable size by the known vscale so both sizes are expressed
// in fixed units, then forward only on an exact size match. E.g. with
// vscale_range(4,4), a <vscale x 4 x float> store (min 128 bits) covers
// 512 bits and can be forwarded to a <16 x float> load.
unsigned FixedStoreSize =
(StoreSize.isFixed() ? StoreSize : StoreSize * VScale)
.getKnownMinValue();
unsigned FixedLoadSize =
(LoadSize.isFixed() ? LoadSize : LoadSize * VScale)
.getKnownMinValue();
if (FixedStoreSize == FixedLoadSize) {
*IsVectorKindChange = true;
return Val;
}
}
}

if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -655,8 +675,8 @@ Value *llvm::findAvailablePtrLoadStore(

--ScanFrom;

if (Value *Available = getAvailableLoadStore(Inst, StrippedPtr, AccessTy,
AtLeastAtomic, DL, IsLoadCSE))
if (Value *Available = getAvailableLoadStore(
Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL, IsLoadCSE, nullptr))
return Available;

// Try to get the store size for the type.
@@ -711,7 +731,7 @@ Value *llvm::findAvailablePtrLoadStore(
}

Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
bool *IsLoadCSE,
bool *IsLoadCSE, bool *IsVectorKindChange,
unsigned MaxInstsToScan) {
const DataLayout &DL = Load->getDataLayout();
Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -734,8 +754,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
if (MaxInstsToScan-- == 0)
return nullptr;

Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
AtLeastAtomic, DL, IsLoadCSE);
Available =
getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL,
IsLoadCSE, IsVectorKindChange);
if (Available)
break;

33 changes: 25 additions & 8 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3389,17 +3389,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Value *Vec = II->getArgOperand(0);
Value *SubVec = II->getArgOperand(1);
Value *Idx = II->getArgOperand(2);
auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
auto *DstTy = cast<VectorType>(II->getType());
auto *VecTy = cast<VectorType>(Vec->getType());
auto *SubVecTy = cast<VectorType>(SubVec->getType());
unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();

// Try store-to-load forwarding where the stored value has the same
// type as this intrinsic, and the loaded value is the inserted
// vector. This has to be done here because a temporary insert of
// a scalable vector (the available value) into a fixed-size one
// (the second operand of this intrinsic) cannot be created.
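// For example (mirroring the new test below, where vscale_range(4,4) makes
// <vscale x 4 x float> exactly 512 bits, the same size as <16 x float>;
// value names are illustrative):
//   store <vscale x 4 x float> %v, ptr %p
//   %f = load <16 x float>, ptr %p
//   %r = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(
//            <vscale x 4 x float> poison, <16 x float> %f, i64 0)
// folds to %r = %v.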
if (auto *LI = dyn_cast<LoadInst>(SubVec);
LI && IdxN == 0 && DstTy->isScalableTy() && !SubVecTy->isScalableTy()) {
bool IsVectorKindChange = false;
BatchAAResults BatchAA(*AA);
if (Value *AvailVal = FindAvailableLoadedValue(LI, BatchAA, nullptr,
&IsVectorKindChange);
AvailVal && IsVectorKindChange && AvailVal->getType() == DstTy) {
return replaceInstUsesWith(CI, AvailVal);
}
}

// Only canonicalize if the destination vector, Vec, and SubVec are all
// fixed vectors.
if (DstTy && VecTy && SubVecTy) {
unsigned DstNumElts = DstTy->getNumElements();
unsigned VecNumElts = VecTy->getNumElements();
unsigned SubVecNumElts = SubVecTy->getNumElements();
unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
if (!DstTy->isScalableTy() && !VecTy->isScalableTy() &&
!SubVecTy->isScalableTy()) {
unsigned DstNumElts = DstTy->getElementCount().getFixedValue();
unsigned VecNumElts = VecTy->getElementCount().getFixedValue();
unsigned SubVecNumElts = SubVecTy->getElementCount().getFixedValue();

// An insert that entirely overwrites Vec with SubVec is a nop.
if (VecNumElts == SubVecNumElts)
66 changes: 66 additions & 0 deletions llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -0,0 +1,66 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=instcombine < %s | FileCheck %s

%struct.svfloat32_wrapped_t = type { <16 x float> }

define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x float> %.coerce) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
;
entry:
%retval = alloca %struct.svfloat32_wrapped_t
%0 = fadd <vscale x 4 x float> %.coerce, %.coerce
store <vscale x 4 x float> %0, ptr %retval
%1 = load <16 x float>, ptr %retval
%cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x float> @vscale_not_fixed(<vscale x 4 x float> %.coerce) #1 {
; CHECK-LABEL: define <vscale x 4 x float> @vscale_not_fixed(
; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
;
entry:
%retval = alloca %struct.svfloat32_wrapped_t
%0 = fadd <vscale x 4 x float> %.coerce, %.coerce
store <vscale x 4 x float> %0, ptr %retval
%1 = load <16 x float>, ptr %retval
%cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x float> @sizes_do_not_match(<vscale x 4 x float> %.coerce) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @sizes_do_not_match(
; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[RETVAL]], align 32
; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> [[TMP1]], i64 0)
; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
;
entry:
%retval = alloca %struct.svfloat32_wrapped_t
%0 = fadd <vscale x 4 x float> %.coerce, %.coerce
store <vscale x 4 x float> %0, ptr %retval
%1 = load <8 x float>, ptr %retval
%cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> %1, i64 0)
ret <vscale x 4 x float> %cast.scalable
}

declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64 immarg)
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float>, <8 x float>, i64 immarg)

attributes #0 = { vscale_range(4,4) }
attributes #1 = { vscale_range(1,16) }