Skip to content

Commit aea8f8b

Browse files
authored
Revert "Revert "[llvm][ARM]Add widen global arrays pass" (#112701)"
This reverts commit 370fd74.
1 parent 83c6e2f commit aea8f8b

18 files changed

+589
-0
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1819,6 +1819,10 @@ class TargetTransformInfo {
18191819
/// \return The maximum number of function arguments the target supports.
18201820
unsigned getMaxNumArgs() const;
18211821

1822+
/// \return The number of bytes by which a global array of the given \p Size
1823+
/// should be padded. Default is no padding (0).
1824+
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
1825+
18221826
/// @}
18231827

18241828
private:
@@ -2225,6 +2229,8 @@ class TargetTransformInfo::Concept {
22252229
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
22262230
virtual bool hasArmWideBranch(bool Thumb) const = 0;
22272231
virtual unsigned getMaxNumArgs() const = 0;
2232+
virtual unsigned getNumBytesToPadGlobalArray(unsigned Size,
2233+
Type *ArrayType) const = 0;
22282234
};
22292235

22302236
template <typename T>
@@ -3026,6 +3032,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
30263032
unsigned getMaxNumArgs() const override {
30273033
return Impl.getMaxNumArgs();
30283034
}
3035+
3036+
// Forwards the query unchanged to the wrapped implementation object (Impl)
// and returns its result.
unsigned getNumBytesToPadGlobalArray(unsigned Size,
3037+
Type *ArrayType) const override {
3038+
return Impl.getNumBytesToPadGlobalArray(Size, ArrayType);
3039+
}
30293040
};
30303041

30313042
template <typename T>

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,6 +1006,10 @@ class TargetTransformInfoImplBase {
10061006

10071007
unsigned getMaxNumArgs() const { return UINT_MAX; }
10081008

1009+
// Base implementation: ignores both arguments and always reports 0 bytes of
// padding.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
1010+
return 0;
1011+
}
1012+
10091013
protected:
10101014
// Obtain the minimum required size to hold the value (without the sign)
10111015
// In case of a vector it returns the min required size for one element.

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,6 +1383,12 @@ bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const {
13831383
return TTIImpl->isVectorShiftByScalarCheap(Ty);
13841384
}
13851385

1386+
// Public entry point: delegates to the implementation held in TTIImpl and
// returns whatever padding byte count it reports.
unsigned
1387+
TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size,
1388+
Type *ArrayType) const {
1389+
return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType);
1390+
}
1391+
13861392
TargetTransformInfo::Concept::~Concept() = default;
13871393

13881394
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ static cl::opt<bool>
5656
AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
5757
cl::desc("Enable the generation of WLS loops"));
5858

59+
static cl::opt<bool> UseWidenGlobalArrays(
60+
"widen-global-strings", cl::Hidden, cl::init(true),
61+
cl::desc("Enable the widening of global strings to alignment boundaries"));
62+
5963
extern cl::opt<TailPredication::Mode> EnableTailPredication;
6064

6165
extern cl::opt<bool> EnableMaskedGatherScatters;
@@ -2805,3 +2809,32 @@ bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
28052809
}
28062810
return true;
28072811
}
2812+
2813+
unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
2814+
Type *ArrayType) const {
2815+
if (!UseWidenGlobalArrays) {
2816+
LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
2817+
return false;
2818+
}
2819+
2820+
// Don't modify none integer array types
2821+
if (!ArrayType || !ArrayType->isArrayTy() ||
2822+
!ArrayType->getArrayElementType()->isIntegerTy())
2823+
return 0;
2824+
2825+
// We pad to 4 byte boundaries
2826+
if (Size % 4 == 0)
2827+
return 0;
2828+
2829+
unsigned NumBytesToPad = 4 - (Size % 4);
2830+
unsigned NewSize = Size + NumBytesToPad;
2831+
2832+
// Max number of bytes that memcpy allows for lowering to load/stores before
2833+
// it uses library function (__aeabi_memcpy).
2834+
unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2835+
2836+
if (NewSize > MaxMemIntrinsicSize)
2837+
return 0;
2838+
2839+
return NumBytesToPad;
2840+
}

llvm/lib/Target/ARM/ARMTargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
337337

338338
bool isProfitableToSinkOperands(Instruction *I,
339339
SmallVectorImpl<Use *> &Ops) const;
340+
341+
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
342+
340343
/// @}
341344
};
342345

llvm/lib/Transforms/IPO/GlobalOpt.cpp

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ STATISTIC(NumInternalFunc, "Number of internal functions");
9292
STATISTIC(NumColdCC, "Number of functions marked coldcc");
9393
STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
9494
STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
95+
STATISTIC(NumGlobalArraysPadded,
96+
"Number of global arrays padded to alignment boundary");
9597

9698
static cl::opt<bool>
9799
EnableColdCCStressTest("enable-coldcc-stress-test",
@@ -2029,6 +2031,165 @@ OptimizeFunctions(Module &M,
20292031
return Changed;
20302032
}
20312033

2034+
/// Return true if \p CI is non-null and its called function is the memcpy
/// intrinsic; false otherwise (including when no called function is
/// available).
static bool callInstIsMemcpy(CallInst *CI) {
  if (CI == nullptr)
    return false;

  Function *Callee = CI->getCalledFunction();
  return Callee && Callee->isIntrinsic() &&
         Callee->getIntrinsicID() == Intrinsic::memcpy;
}
2044+
2045+
/// Decide whether the destination (argument 0) of the call \p CI may be
/// replaced by a wider array.
///
/// This holds only when argument 3 is a constant integer that is not one
/// (the original names it IsVolatile) and argument 0 is a static alloca whose
/// allocated type is an array.
static bool destArrayCanBeWidened(CallInst *CI) {
  auto *VolatileFlag = dyn_cast<ConstantInt>(CI->getArgOperand(3));
  if (!VolatileFlag || VolatileFlag->isOne())
    return false;

  auto *DestAlloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
  if (!DestAlloca)
    return false;

  return DestAlloca->isStaticAlloca() &&
         DestAlloca->getAllocatedType()->isArrayTy();
}
2060+
2061+
/// Build a padded replacement for the global \p OldVar.
///
/// The raw bytes of the initializer are copied, \p NumBytesToPad zero bytes
/// are appended, and the first (\p NumBytesToCopy + \p NumBytesToPad) bytes
/// become the initializer of a new global created in the module of \p F.
///
/// \param OldVar         Global to widen; its initializer must be a
///                       ConstantDataArray.
/// \param F              Function supplying the context and parent module.
/// \param NumBytesToPad  Number of zero bytes to append.
/// \param NumBytesToCopy Number of leading initializer bytes to keep.
/// \return The new global, which takes over the name and attributes of
///         \p OldVar, or nullptr if \p OldVar has no suitable initializer.
static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F,
                                           unsigned NumBytesToPad,
                                           unsigned NumBytesToCopy) {
  if (!OldVar->hasInitializer())
    return nullptr;

  auto *OldInit = dyn_cast<ConstantDataArray>(OldVar->getInitializer());
  if (!OldInit)
    return nullptr;

  // Copy the initializer bytes and extend them with null bytes so the source
  // of the memcpy becomes word aligned in size.
  StringRef RawBytes = OldInit->getRawDataValues();
  std::vector<uint8_t> PaddedBytes(RawBytes.begin(), RawBytes.end());
  PaddedBytes.resize(PaddedBytes.size() + NumBytesToPad, '\0');
  auto PaddedRef = ArrayRef(PaddedBytes.data(), NumBytesToCopy + NumBytesToPad);

  // Materialise the padded initializer and the global that holds it.
  Constant *WidenedInit = ConstantDataArray::get(F->getContext(), PaddedRef);
  Module &ParentModule = *(F->getParent());
  GlobalVariable *WidenedGV = new GlobalVariable(
      ParentModule, WidenedInit->getType(), /*isConstant=*/true,
      OldVar->getLinkage(), WidenedInit, WidenedInit->getName());

  // Carry over the remaining attributes (e.g. unnamed_addr) and the name of
  // the variable being replaced.
  WidenedGV->copyAttributesFrom(OldVar);
  WidenedGV->takeName(OldVar);
  return WidenedGV;
}
2090+
2091+
/// Replace the alloca that is the destination (argument 0) of \p CI with a
/// wider array alloca able to hold \p NumBytesToCopy + \p NumBytesToPad bytes.
///
/// The element count of the new array is the padded byte count divided
/// (rounding up) by the element byte size of \p SourceDataArray; the element
/// type is taken from the old alloca. The new alloca inherits the name and
/// alignment of the old one, which is then erased. Nothing happens when
/// argument 0 is not an alloca.
static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad,
                           const unsigned NumBytesToCopy,
                           ConstantDataArray *SourceDataArray) {
  auto *OldAlloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
  if (!OldAlloca)
    return;

  unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
  unsigned WidenedBytes = NumBytesToCopy + NumBytesToPad;
  unsigned WidenedElements = divideCeil(WidenedBytes, ElementByteWidth);

  // Create the word-aligned destination right before the old alloca
  // (memcpy(X,...,...)).
  Type *ElementTy = OldAlloca->getAllocatedType()->getArrayElementType();
  IRBuilder<> Builder(OldAlloca);
  AllocaInst *WideAlloca =
      Builder.CreateAlloca(ArrayType::get(ElementTy, WidenedElements));
  WideAlloca->takeName(OldAlloca);
  WideAlloca->setAlignment(OldAlloca->getAlign());

  // Redirect every user to the wider array, then drop the original.
  OldAlloca->replaceAllUsesWith(WideAlloca);
  OldAlloca->eraseFromParent();
}
2110+
2111+
/// Widen the global \p SourceVar and every eligible memcpy destination that
/// copies from it.
///
/// A padded copy of \p SourceVar is created first. Each user of \p SourceVar
/// that is a memcpy with a widenable destination and with \p SourceVar as its
/// source (argument 1) gets a widened destination array and a length operand
/// of \p NumBytesToCopy + \p NumBytesToPad. Finally all uses of \p SourceVar
/// are redirected to the padded global and the statistic is bumped.
///
/// \return false if the padded global could not be created, true otherwise.
static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
                                        const unsigned NumBytesToPad,
                                        const unsigned NumBytesToCopy,
                                        ConstantInt *BytesToCopyOp,
                                        ConstantDataArray *SourceDataArray) {
  GlobalVariable *WidenedGV =
      widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy);
  if (WidenedGV == nullptr)
    return false;

  // Patch up the memcpy users: widen their destination and copy length.
  for (auto *U : SourceVar->users()) {
    auto *Memcpy = dyn_cast<CallInst>(U);
    const bool IsWidenableCopy = callInstIsMemcpy(Memcpy) &&
                                 destArrayCanBeWidened(Memcpy) &&
                                 Memcpy->getArgOperand(1) == SourceVar;
    if (!IsWidenableCopy)
      continue;

    widenDestArray(Memcpy, NumBytesToPad, NumBytesToCopy, SourceDataArray);

    Constant *WidenedLength = ConstantInt::get(BytesToCopyOp->getType(),
                                               NumBytesToCopy + NumBytesToPad);
    Memcpy->setArgOperand(2, WidenedLength);
  }

  SourceVar->replaceAllUsesWith(WidenedGV);

  NumGlobalArraysPadded++;
  return true;
}
2141+
2142+
/// Try to pad the constant global array \p GV, together with the stack arrays
/// it is copied into, when the target reports that padding is worthwhile.
///
/// Only globals that have an initializer, are constant, have local linkage and
/// are unnamed_addr are considered. The users of \p GV are scanned for a
/// memcpy accepted by callInstIsMemcpy and destArrayCanBeWidened whose length
/// is a constant and for which elements copied == destination array length ==
/// source array length. The first such call for which the target returns a
/// non-zero pad decides the result.
///
/// \param GV     Candidate global variable.
/// \param GetTTI Callback returning the TargetTransformInfo for a function.
/// \return true if \p GV was widened, false otherwise.
static bool tryWidenGlobalArraysUsedByMemcpy(
    GlobalVariable *GV,
    function_ref<TargetTransformInfo &(Function &)> GetTTI) {

  if (!GV->hasInitializer() || !GV->isConstant() || !GV->hasLocalLinkage() ||
      !GV->hasGlobalUnnamedAddr())
    return false;

  for (auto *User : GV->users()) {
    CallInst *CI = dyn_cast<CallInst>(User);
    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
      continue;

    // Use the function that contains the memcpy, not the callee: the callee
    // is the memcpy intrinsic itself (see callInstIsMemcpy), so querying the
    // target through it would not describe the code that performs the copy.
    Function *F = CI->getFunction();

    auto *BytesToCopyOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
    if (!BytesToCopyOp)
      continue;

    ConstantDataArray *SourceDataArray =
        dyn_cast<ConstantDataArray>(GV->getInitializer());
    if (!SourceDataArray)
      continue;

    unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();

    // destArrayCanBeWidened already established that argument 0 is an alloca
    // of array type, so an asserting cast documents that invariant.
    auto *Alloca = cast<AllocaInst>(CI->getArgOperand(0));
    uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
    uint64_t SZSize = SourceDataArray->getType()->getNumElements();
    unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
    // Calculate the number of elements to copy while avoiding floored
    // division of integers returning wrong values, i.e. copying one byte
    // from an array of i16 would yield 0 elements to copy as opposed to 1.
    unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth);

    // For safety purposes let's add a constraint and only pad when
    // NumElementsToCopy == destination array size ==
    // source which is a constant
    if (NumElementsToCopy != DZSize || DZSize != SZSize)
      continue;

    unsigned NumBytesToPad = GetTTI(*F).getNumBytesToPadGlobalArray(
        NumBytesToCopy, SourceDataArray->getType());
    if (NumBytesToPad) {
      return tryWidenGlobalArrayAndDests(F, GV, NumBytesToPad, NumBytesToCopy,
                                         BytesToCopyOp, SourceDataArray);
    }
  }
  return false;
}
2192+
20322193
static bool
20332194
OptimizeGlobalVars(Module &M,
20342195
function_ref<TargetTransformInfo &(Function &)> GetTTI,
@@ -2058,6 +2219,10 @@ OptimizeGlobalVars(Module &M,
20582219
continue;
20592220
}
20602221

2222+
// For global variable arrays called in a memcpy
2223+
// we try to pad to nearest valid alignment boundary
2224+
Changed |= tryWidenGlobalArraysUsedByMemcpy(&GV, GetTTI);
2225+
20612226
Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
20622227
}
20632228
return Changed;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
3+
4+
@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
5+
6+
define void @memcpy_struct() {
7+
; CHECK-LABEL: define void @memcpy_struct() local_unnamed_addr {
8+
; CHECK-NEXT: [[ENTRY:.*:]]
9+
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca { i8, i8, i8 }, align 1
10+
; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
11+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
12+
; CHECK-NEXT: ret void
13+
;
14+
entry:
15+
%something = alloca {i8, i8, i8}, align 1
16+
%call1 = call i32 @bar(ptr nonnull %something)
17+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
18+
ret void
19+
}
20+
21+
22+
@.i8_multi = private unnamed_addr constant [2 x [3 x i8]] [[3 x i8] [i8 1, i8 2, i8 3], [3 x i8] [i8 4, i8 5, i8 6]] , align 1
23+
24+
define void @memcpy_array_multidimensional() {
25+
; CHECK-LABEL: define void @memcpy_array_multidimensional() local_unnamed_addr {
26+
; CHECK-NEXT: [[ENTRY:.*:]]
27+
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [2 x [3 x i8]], align 1
28+
; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
29+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
30+
; CHECK-NEXT: ret void
31+
;
32+
entry:
33+
%something = alloca [2 x [3 x i8]], align 1
34+
%call1 = call i32 @bar(ptr nonnull %something)
35+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
36+
ret void
37+
}
38+
39+
declare i32 @bar(...)
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
3+
4+
; CHECK: [3 x i8]
5+
@other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1
6+
; CHECK: [4 x i8]
7+
@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
8+
9+
define void @memcpy_multiple() {
10+
; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr {
11+
; CHECK-NEXT: [[ENTRY:.*:]]
12+
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [4 x i8], align 1
13+
; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
14+
; CHECK-NEXT: [[CALL3:%.*]] = call i32 @bar(ptr nonnull @other)
15+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
16+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
17+
; CHECK-NEXT: ret void
18+
;
19+
entry:
20+
%something = alloca [3 x i8], align 1
21+
%call1 = call i32 @bar(ptr nonnull %something)
22+
%call2 = call i32 @bar(ptr nonnull @other)
23+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
24+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
25+
ret void
26+
}
27+
28+
declare i32 @bar(...)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
3+
4+
@.i16 = private unnamed_addr constant [5 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5] , align 1
5+
6+
define void @memcpy_i16_array() {
7+
; CHECK-LABEL: define void @memcpy_i16_array() local_unnamed_addr {
8+
; CHECK-NEXT: [[ENTRY:.*:]]
9+
; CHECK-NEXT: [[SOMETHING1:%.*]] = alloca [6 x i16], align 1
10+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false)
11+
; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
12+
; CHECK-NEXT: ret void
13+
;
14+
entry:
15+
%something = alloca [5 x i16], align 1
16+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 10, i1 false)
17+
%call2 = call i32 @bar(ptr nonnull %something)
18+
ret void
19+
}
20+
21+
22+
declare i32 @bar(...)

0 commit comments

Comments
 (0)