
Commit ca68a7f

Reapply: [MemCpyOpt] implement single-BB stack-move optimization which unifies the static unescaped allocas

This reverts commit 2077180.

1 parent a02ad6c commit ca68a7f
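
For orientation, a minimal sketch of the IR pattern this optimization targets — illustrative only, not taken from the commit's tests; the nocapture helpers @init and @use are hypothetical. The pass catches the pattern both as a memcpy (in processMemCpy) and as a full-size load/store pair (in processStoreOfLoad):

    ; Before: a full-size copy between two unescaped, static, same-sized
    ; allocas whose uses all sit in one basic block.
    declare void @init(ptr nocapture)
    declare void @use(ptr nocapture readonly)
    declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

    define void @move() {
      %src = alloca [16 x i8], align 1
      %dest = alloca [16 x i8], align 1
      call void @init(ptr %src)
      call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false)
      call void @use(ptr %dest)
      ret void
    }

    ; After: %dest is merged into %src and the copy is deleted.
    define void @move() {
      %src = alloca [16 x i8], align 1
      call void @init(ptr %src)
      call void @use(ptr %src)
      ret void
    }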

5 files changed (+320 additions, -129 deletions)

llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,7 @@
 namespace llvm {
 
 class AAResults;
+class AllocaInst;
 class BatchAAResults;
 class AssumptionCache;
 class CallBase;
@@ -77,6 +78,9 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
   bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
+  bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                             AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
+                             uint64_t Size, BatchAAResults &BAA);
 
   void eraseInstruction(Instruction *I);
   bool iterateOnFunction(Function &F);

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 267 additions & 5 deletions
@@ -66,9 +66,10 @@ static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
 
 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
-STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
-STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
-STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
+STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
+STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
+STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
 
 namespace {
 
@@ -730,6 +731,23 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
     return true;
   }
 
+  // If this is a load-store pair from a stack slot to a stack slot, we
+  // might be able to perform the stack-move optimization just as we do for
+  // memcpys from an alloca to an alloca.
+  if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
+    if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
+      if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
+                                DL.getTypeStoreSize(T), BAA)) {
+        // Avoid invalidating the iterator.
+        BBI = SI->getNextNonDebugInstruction()->getIterator();
+        eraseInstruction(SI);
+        eraseInstruction(LI);
+        ++NumMemCpyInstr;
+        return true;
+      }
+    }
+  }
+
   return false;
 }
 
@@ -1408,6 +1426,227 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
   return true;
 }
 
+// Attempts to optimize the pattern whereby memory is copied from an alloca to
+// another alloca, where the two allocas don't have conflicting mod/ref. If
+// successful, the two allocas can be merged into one and the transfer can be
+// deleted. This pattern is generated frequently in Rust, due to the ubiquity
+// of move operations in that language.
+//
+// Once we determine that the optimization is safe to perform, we replace all
+// uses of the destination alloca with the source alloca. We also "shrink
+// wrap" the lifetime markers of the single merged alloca to before the first
+// use and after the last use. Note that the "shrink wrapping" procedure is a
+// safe transformation only because we restrict the scope of this optimization
+// to allocas that aren't captured.
+bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                                          AllocaInst *DestAlloca,
+                                          AllocaInst *SrcAlloca, uint64_t Size,
+                                          BatchAAResults &BAA) {
+  LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
+                    << *Store << "\n");
+
+  // Make sure the two allocas are in the same address space.
+  if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
+    return false;
+  }
+
+  // 1. Check that the copy is full. Calculate the static size of the allocas
+  // to be merged; bail out if we can't.
+  const DataLayout &DL = DestAlloca->getModule()->getDataLayout();
+  std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
+  if (!SrcSize || SrcSize->isScalable() || Size != SrcSize->getFixedValue()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
+    return false;
+  }
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
+  if (!DestSize || DestSize->isScalable() ||
+      Size != DestSize->getFixedValue()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+    return false;
+  }
+
+  // 2-1. Check that src and dest are static allocas, which are not affected
+  // by stacksave/stackrestore.
+  if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca() ||
+      SrcAlloca->getParent() != Load->getParent() ||
+      SrcAlloca->getParent() != Store->getParent())
+    return false;
+
+  // 2-2. Check that src and dest are never-captured, unescaped allocas. Also
+  // collect the first/last users and the full-size lifetime markers in order
+  // to shrink-wrap the lifetimes, and instructions with noalias metadata so
+  // that metadata can be removed.
+
+  SmallVector<Instruction *, 4> LifetimeMarkers;
+  Instruction *FirstUser = nullptr, *LastUser = nullptr;
+  SmallSet<Instruction *, 4> NoAliasInstrs;
+
+  // Recursively track the users and check whether any modifying aliases
+  // exist.
+  auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool {
+    bool CanBeNull, CanBeFreed;
+    return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
+  };
+
+  auto CaptureTrackingWithModRef =
+      [&](Instruction *AI,
+          function_ref<bool(Instruction *)> ModRefCallback) -> bool {
+    SmallVector<Instruction *, 8> Worklist;
+    Worklist.push_back(AI);
+    unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
+    Worklist.reserve(MaxUsesToExplore);
+    SmallSet<const Use *, 20> Visited;
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.back();
+      Worklist.pop_back();
+      for (const Use &U : I->uses()) {
+        if (Visited.size() >= MaxUsesToExplore) {
+          LLVM_DEBUG(
+              dbgs()
+              << "Stack Move: Exceeded max uses to see ModRef, bailing\n");
+          return false;
+        }
+        if (!Visited.insert(&U).second)
+          continue;
+        switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) {
+        case UseCaptureKind::MAY_CAPTURE:
+          return false;
+        case UseCaptureKind::PASSTHROUGH:
+          // Instructions cannot have non-instruction users.
+          Worklist.push_back(cast<Instruction>(U.getUser()));
+          continue;
+        case UseCaptureKind::NO_CAPTURE: {
+          auto *UI = cast<Instruction>(U.getUser());
+          if (DestAlloca->getParent() != UI->getParent())
+            return false;
+          if (!FirstUser || UI->comesBefore(FirstUser))
+            FirstUser = UI;
+          if (!LastUser || LastUser->comesBefore(UI))
+            LastUser = UI;
+          if (UI->isLifetimeStartOrEnd()) {
+            // We note the locations of these intrinsic calls so that we can
+            // delete them later if the optimization succeeds; this is safe
+            // since both llvm.lifetime.start and llvm.lifetime.end intrinsics
+            // practically fill all the bytes of the alloca with an undefined
+            // value, although conceptually marked as alive/dead.
+            int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
+            if (Size < 0 || Size == DestSize) {
+              LifetimeMarkers.push_back(UI);
+              continue;
+            }
+          }
+          if (UI->hasMetadata(LLVMContext::MD_noalias))
+            NoAliasInstrs.insert(UI);
+          if (!ModRefCallback(UI))
+            return false;
+        }
+        }
+      }
+    }
+    return true;
+  };
+
+  // 3. Check that dest has no Mod/Ref, except full-size lifetime intrinsics,
+  // from the alloca to the Store.
+  ModRefInfo DestModRef = ModRefInfo::NoModRef;
+  MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
+  auto DestModRefCallback = [&](Instruction *UI) -> bool {
+    // We don't care about the store itself.
+    if (UI == Store)
+      return true;
+    ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
+    // FIXME: For multi-BB cases, we need to see reachability from it to the
+    // store.
+    // Bail out if Dest may have any ModRef before the Store.
+    if (UI->comesBefore(Store) && isModOrRefSet(Res))
+      return false;
+    DestModRef |= Res;
+
+    return true;
+  };
+
+  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
+    return false;
+
+  // 4. Check that, from after the Load to the end of the BB,
+  // 4-1. if the dest has any Mod, src has no Ref, and
+  // 4-2. if the dest has any Ref, src has no Mod except full-size lifetimes.
+  MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size));
+
+  auto SrcModRefCallback = [&](Instruction *UI) -> bool {
+    // Any ModRef before the Load doesn't matter; the Load and Store can also
+    // be ignored.
+    if (UI->comesBefore(Load) || UI == Load || UI == Store)
+      return true;
+    ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc);
+    if ((isModSet(DestModRef) && isRefSet(Res)) ||
+        (isRefSet(DestModRef) && isModSet(Res)))
+      return false;
+
+    return true;
+  };
+
+  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
+    return false;
+
+  // We can do the transformation. First, align the allocas appropriately.
+  SrcAlloca->setAlignment(
+      std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
+
+  // Merge the two allocas.
+  DestAlloca->replaceAllUsesWith(SrcAlloca);
+  eraseInstruction(DestAlloca);
+
+  // Drop metadata on the source alloca.
+  SrcAlloca->dropUnknownNonDebugMetadata();
+
+  // "Shrink wrap" the lifetimes, if the original lifetime intrinsics exist.
+  if (!LifetimeMarkers.empty()) {
+    LLVMContext &C = SrcAlloca->getContext();
+    IRBuilder<> Builder(C);
+
+    ConstantInt *AllocaSize = ConstantInt::get(Type::getInt64Ty(C), Size);
+    // Create a new lifetime start marker before the first user of the merged
+    // alloca.
+    Builder.SetInsertPoint(FirstUser->getParent(), FirstUser->getIterator());
+    auto *Start = Builder.CreateLifetimeStart(SrcAlloca, AllocaSize);
+    auto *FirstMA = MSSA->getMemoryAccess(FirstUser);
+    auto *StartMA = MSSAU->createMemoryAccessBefore(
+        Start, FirstMA->getDefiningAccess(), FirstMA);
+    MSSAU->insertDef(cast<MemoryDef>(StartMA), /*RenameUses=*/true);
+
+    // Create a new lifetime end marker after the last user of the merged
+    // alloca.
+    // FIXME: If the last user is the terminator for the bb, we can insert the
+    // lifetime.end marker into the immediate post-dominator, but currently we
+    // do nothing.
+    if (!LastUser->isTerminator()) {
+      Builder.SetInsertPoint(LastUser->getParent(), ++LastUser->getIterator());
+      auto *End = Builder.CreateLifetimeEnd(SrcAlloca, AllocaSize);
+      auto *LastMA = MSSA->getMemoryAccess(LastUser);
+      // FIXME: the second argument should be LastMA if LastMA is a MemoryDef,
+      // but that's updated by insertDef.
+      auto *EndMA = MSSAU->createMemoryAccessAfter(
+          End, LastMA->getDefiningAccess(), LastMA);
+      MSSAU->insertDef(cast<MemoryDef>(EndMA), /*RenameUses=*/true);
+    }
+
+    // Remove all other lifetime markers.
+    for (Instruction *I : LifetimeMarkers)
+      eraseInstruction(I);
+  }
+
+  // As this transformation can cause memory accesses that didn't previously
+  // alias to begin to alias one another, we remove !noalias metadata from any
+  // uses of either alloca. This is conservative, but more precision doesn't
+  // seem worthwhile right now.
+  for (Instruction *I : NoAliasInstrs)
+    I->setMetadata(LLVMContext::MD_noalias, nullptr);
+
+  LLVM_DEBUG(dbgs() << "Stack Move: Performed stack-move optimization\n");
+  NumStackMove++;
+  return true;
+}
+
 /// Perform simplification of memcpy's. If we have memcpy A
 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
@@ -1464,13 +1703,14 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
   MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
       AnyClobber, MemoryLocation::getForSource(M), BAA);
 
-  // There are four possible optimizations we can do for memcpy:
+  // There are five possible optimizations we can do for memcpy:
   //   a) memcpy-memcpy xform which exposes redundance for DSE.
   //   b) call-memcpy xform for return slot optimization.
   //   c) memcpy from freshly alloca'd space or space that has just started
   //      its lifetime copies undefined data, and we can therefore eliminate
   //      the memcpy in favor of the data that was already at the destination.
   //   d) memcpy from a just-memset'd source can be turned into memset.
+  //   e) elimination of memcpy via stack-move optimization.
   if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
     if (Instruction *MI = MD->getMemoryInst()) {
       if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
@@ -1489,7 +1729,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
        }
      }
      if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-       return processMemCpyMemCpyDependence(M, MDep, BAA);
+       if (processMemCpyMemCpyDependence(M, MDep, BAA))
+         return true;
      if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
        if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
          LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
@@ -1508,6 +1749,27 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
     }
   }
 
+  // If the transfer is from a stack slot to a stack slot, then we may be able
+  // to perform the stack-move optimization. See the comments in
+  // performStackMoveOptzn() for more details.
+  auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
+  if (!DestAlloca)
+    return false;
+  auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
+  if (!SrcAlloca)
+    return false;
+  ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
+  if (Len == nullptr)
+    return false;
+  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue(),
+                            BAA)) {
+    // Avoid invalidating the iterator.
+    BBI = M->getNextNonDebugInstruction()->getIterator();
+    eraseInstruction(M);
+    ++NumMemCpyInstr;
+    return true;
+  }
+
   return false;
 }
 
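To illustrate the "shrink wrap" step of performStackMoveOptzn() above: once the allocas are merged, the original full-size lifetime markers are deleted and fresh ones are placed just before the first user and just after the last user of the merged alloca. A hedged sketch with hypothetical names (not from this commit's tests):

    ; Result of lifetime shrink-wrapping on the merged alloca %src:
      %src = alloca [16 x i8], align 1
      call void @llvm.lifetime.start.p0(i64 16, ptr %src)  ; before the first user
      call void @init(ptr nocapture %src)                  ; first user
      call void @use(ptr nocapture readonly %src)          ; last user
      call void @llvm.lifetime.end.p0(i64 16, ptr %src)    ; after the last user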
llvm/test/Transforms/MemCpyOpt/callslot.ll

Lines changed: 0 additions & 2 deletions
@@ -56,11 +56,9 @@ define void @write_dest_between_call_and_memcpy() {
 
 define void @write_src_between_call_and_memcpy() {
 ; CHECK-LABEL: @write_src_between_call_and_memcpy(
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [16 x i8], align 1
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca [16 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[SRC]], i8 0, i64 16, i1 false)
 ; CHECK-NEXT:    store i8 1, ptr [[SRC]], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 16, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dest = alloca [16 x i8]
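(The [[DEST]] alloca and the trailing memcpy vanish from the expected output because the new stack-move optimization merges %dest into %src and deletes the copy.)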

llvm/test/Transforms/MemCpyOpt/lifetime-missing.ll

Lines changed: 5 additions & 6 deletions
@@ -13,15 +13,14 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[AGG_TMP3_SROA_35:%.*]] = alloca [20 x i8], align 4
 ; CHECK-NEXT:    [[AGG_TMP_SROA_14:%.*]] = alloca [20 x i8], align 4
 ; CHECK-NEXT:    [[AGG_TMP_SROA_14_128_SROA_IDX:%.*]] = getelementptr i8, ptr [[AGG_TMP_SROA_14]], i64 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr [[AGG_TMP_SROA_14]])
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[AGG_TMP_SROA_14_128_SROA_IDX]], i8 0, i64 1, i1 false)
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr [[AGG_TMP3_SROA_35]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[AGG_TMP3_SROA_35]], ptr [[AGG_TMP_SROA_14]], i64 20, i1 false)
-; CHECK-NEXT:    [[AGG_TMP3_SROA_35_128_SROA_IDX:%.*]] = getelementptr i8, ptr [[AGG_TMP3_SROA_35]], i64 4
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr inttoptr (i64 4 to ptr), ptr [[AGG_TMP3_SROA_35_128_SROA_IDX]], i64 1, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr null, ptr [[AGG_TMP3_SROA_35_128_SROA_IDX]], i64 1, i1 false)
+; CHECK-NEXT:    [[AGG_TMP3_SROA_35_128_SROA_IDX:%.*]] = getelementptr i8, ptr [[AGG_TMP_SROA_14]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr inttoptr (i64 4 to ptr), i8 0, i64 1, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr [[AGG_TMP_SROA_14]])
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr null, i8 0, i64 1, i1 false)
 ; CHECK-NEXT:    ret void
 ;
 entry:
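(The expected output changes because stack-move merges [[AGG_TMP3_SROA_35]] into [[AGG_TMP_SROA_14]] and shrink-wraps its lifetime; the remaining copies out of the just-memset'd buffer then fold into memsets.)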
