Skip to content

Commit 7c51669

Browse files
committed
[memcpyopt] Restructure store(load src, dest) form of callslotopt for compile time
The search for the clobbering call is fairly expensive if uses are not optimized at construction. Defer the clobber walk to the point in the implementation where we actually need it; there are a bunch of bailouts before that point. (e.g. If the source pointer is not an alloca, we can't do callslotopt.) On a test case which involves a bunch of copies from argument pointers, this switches memcpyopt from > 1/2 second to < 10ms.
1 parent c0f90c8 commit 7c51669

File tree

2 files changed

+33
-29
lines changed

2 files changed

+33
-29
lines changed

llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
6161
bool processMemMove(MemMoveInst *M);
6262
bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
6363
Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
64-
Align cpyAlign, CallInst *C);
64+
Align cpyAlign, std::function<CallInst *()> GetC);
6565
bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
6666
bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet);
6767
bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet);

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -761,27 +761,25 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
761761
// Detect cases where we're performing call slot forwarding, but
762762
// happen to be using a load-store pair to implement it, rather than
763763
// a memcpy.
764-
CallInst *C = nullptr;
765-
if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
766-
MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
767-
// The load must post-dominate the call. Limit to the same block for now.
768-
// TODO: Support non-local call-slot optimization?
769-
if (LoadClobber->getBlock() == SI->getParent())
770-
C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
771-
}
772-
773-
if (C) {
774-
bool changed = performCallSlotOptzn(
775-
LI, SI, SI->getPointerOperand()->stripPointerCasts(),
776-
LI->getPointerOperand()->stripPointerCasts(),
777-
DL.getTypeStoreSize(SI->getOperand(0)->getType()),
778-
commonAlignment(SI->getAlign(), LI->getAlign()), C);
779-
if (changed) {
780-
eraseInstruction(SI);
781-
eraseInstruction(LI);
782-
++NumMemCpyInstr;
783-
return true;
784-
}
764+
auto GetCall = [&]() -> CallInst * {
765+
// We defer this expensive clobber walk until the cheap checks
766+
// have been done on the source inside performCallSlotOptzn.
767+
if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
768+
MSSA->getWalker()->getClobberingMemoryAccess(LI)))
769+
return dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
770+
return nullptr;
771+
};
772+
773+
bool changed = performCallSlotOptzn(
774+
LI, SI, SI->getPointerOperand()->stripPointerCasts(),
775+
LI->getPointerOperand()->stripPointerCasts(),
776+
DL.getTypeStoreSize(SI->getOperand(0)->getType()),
777+
commonAlignment(SI->getAlign(), LI->getAlign()), GetCall);
778+
if (changed) {
779+
eraseInstruction(SI);
780+
eraseInstruction(LI);
781+
++NumMemCpyInstr;
782+
return true;
785783
}
786784
}
787785
}
@@ -856,7 +854,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
856854
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
857855
Instruction *cpyStore, Value *cpyDest,
858856
Value *cpySrc, TypeSize cpySize,
859-
Align cpyAlign, CallInst *C) {
857+
Align cpyAlign,
858+
std::function<CallInst *()> GetC) {
860859
// The general transformation to keep in mind is
861860
//
862861
// call @func(..., src, ...)
@@ -875,11 +874,6 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
875874
if (cpySize.isScalable())
876875
return false;
877876

878-
// Lifetime marks shouldn't be operated on.
879-
if (Function *F = C->getCalledFunction())
880-
if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
881-
return false;
882-
883877
// Require that src be an alloca. This simplifies the reasoning considerably.
884878
auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
885879
if (!srcAlloca)
@@ -896,6 +890,16 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
896890
if (cpySize < srcSize)
897891
return false;
898892

893+
CallInst *C = GetC();
894+
if (!C)
895+
return false;
896+
897+
// Lifetime marks shouldn't be operated on.
898+
if (Function *F = C->getCalledFunction())
899+
if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
900+
return false;
901+
902+
899903
if (C->getParent() != cpyStore->getParent()) {
900904
LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
901905
return false;
@@ -1459,7 +1463,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
14591463
if (performCallSlotOptzn(
14601464
M, M, M->getDest(), M->getSource(),
14611465
TypeSize::getFixed(CopySize->getZExtValue()), Alignment,
1462-
C)) {
1466+
[C]() -> CallInst * { return C; })) {
14631467
LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
14641468
<< " call: " << *C << "\n"
14651469
<< " memcpy: " << *M << "\n");

0 commit comments

Comments (0)