Commit 827019c

add memcpy removal for simple dest
When code has the pattern of an alloca written by a single memcpy and followed only by reads of that alloca, we can remove the memcpy if we can show it is legal to forward each user to the source directly. This generalizes some of the other transforms in this file: it can hoist uses even when it understands very little about the subsequent operations, whereas the existing transforms require specific analysis of the subsequent instruction (such as memcpy-to-memcpy forwarding) to perform their designed replacements. More precision will be achieved (for the cond_nohoist and phi_nohoist tests) if https://reviews.llvm.org/D119929?id=409338 lands.

The following is an exhaustive list of the existing tests which trigger the new optimization:

Clang :: CodeGen/attr-counted-by.c
LLVM :: Analysis/ScopedNoAliasAA/alias-scope-merging.ll
LLVM :: Transforms/MemCpyOpt/callslot_badaa.ll
LLVM :: Transforms/MemCpyOpt/lifetime.ll
LLVM :: Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll
LLVM :: Transforms/MemCpyOpt/memcpy.ll
1 parent 0d828e6 commit 827019c
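
To make the target pattern concrete, here is a minimal hand-written LLVM IR sketch; it is illustrative only (the function names and types are invented, not taken from the patch or its tests). Before the transform, the alloca exists only to receive a single full-size memcpy and feed reads:

  define i32 @before(ptr noalias %src) {
    %dst = alloca [4 x i32], align 4
    call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 16, i1 false)
    %gep = getelementptr inbounds [4 x i32], ptr %dst, i64 0, i64 2
    %val = load i32, ptr %gep, align 4
    ret i32 %val
  }

  declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

If the pass can prove that nothing writes %src between the memcpy and the reads, it forwards the users of %dst to %src and deletes both the memcpy and the alloca:

  define i32 @after(ptr noalias %src) {
    %gep = getelementptr inbounds [4 x i32], ptr %src, i64 0, i64 2
    %val = load i32, ptr %gep, align 4
    ret i32 %val
  }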

File tree

10 files changed (+778, -83 lines)

clang/test/CodeGen/attr-counted-by.c

Lines changed: 35 additions & 41 deletions
Large diffs are not rendered by default.

llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h

Lines changed: 4 additions & 0 deletions
@@ -79,11 +79,15 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
                              BatchAAResults &BAA);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet,
                                   BatchAAResults &BAA);
+  bool performStackHoistOptzn(MemCpyInst *M, AllocaInst *DestAlloca,
+                              TypeSize Size, BatchAAResults &BAA);
   bool processByValArgument(CallBase &CB, unsigned ArgNo);
   bool processImmutArgument(CallBase &CB, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
   bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
+  bool canMoveUp(Instruction *I, Instruction *P);
+  void moveUp(Instruction *I, Instruction *P);
   bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
                              AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
                              TypeSize Size, BatchAAResults &BAA);

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 269 additions & 15 deletions
@@ -1527,6 +1527,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   }
 
   // Check that copy is full with static size.
+  // TODO: use coversInputFully instead here
   const DataLayout &DL = DestAlloca->getDataLayout();
   std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
   if (!SrcSize || Size != *SrcSize) {
@@ -1719,6 +1720,245 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   return true;
 }
 
+// This method checks if it is possible to lift an instruction to position P.
+// It will check that it could lift the instruction and its arguments.
+// The method returns true if it would be successful.
+bool MemCpyOptPass::canMoveUp(Instruction *I, Instruction *P) {
+  if (I->getIterator() == std::next(P->getIterator()))
+    return true;
+  // TODO: check isGuaranteedToTransferExecutionToSuccessor on all instructions
+  // between I and P
+  SmallSet<const Instruction *, 20> Visited;
+  SmallVector<Instruction *, 8> Worklist;
+  Worklist.push_back(I);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (Value *Op : I->operands()) {
+      Instruction *U = dyn_cast<Instruction>(Op);
+      if (U && !DT->dominates(U, P)) { // (U dom P) is sufficient, but is it the
+                                       // necessary condition?
+        // Cannot hoist if this has visible side-effects or is a PHINode (which
+        // would require converting to a SelectInst in moveUp)
+        if (isa<PHINode>(U) || U->mayReadOrWriteMemory())
+          return false;
+        if (!Visited.insert(U).second)
+          continue; // Already examined this in another branch
+        assert(DT->dominates(P, U)); // Ensure that SSA is correct
+        Worklist.push_back(U);
+      }
+    }
+  }
+  return true;
+}
+
+// Unconditionally move I to P, including all operands
+void MemCpyOptPass::moveUp(Instruction *I, Instruction *P) {
+  if (I->getIterator() == std::next(P->getIterator()))
+    return;
+  // Compute a valid instruction order with DFS walk
+  SmallSet<const Instruction *, 20> Lifted;
+  SmallVector<Instruction *, 8> PreOrder;
+  SmallVector<Instruction *, 8> PostOrder;
+  PreOrder.push_back(I);
+  while (!PreOrder.empty()) {
+    Instruction *I = PreOrder.back();
+    if (Lifted.count(I))
+      continue;
+    for (Value *Op : I->operands()) {
+      Instruction *U = dyn_cast<Instruction>(Op);
+      if (U && !Lifted.count(U) && !DT->dominates(U, P)) {
+        assert(!isa<PHINode>(U));
+        PreOrder.push_back(U);
+      }
+    }
+    if (PreOrder.back() == I) {
+      // All ops scheduled, can now schedule this too
+      Lifted.insert(I);
+      PreOrder.pop_back();
+      PostOrder.push_back(I);
+    }
+  }
+
+  MemoryUseOrDef *MemInsertPoint = MSSA->getMemoryAccess(P);
+  assert(MemInsertPoint);
+
+  // Now move them
+  for (Instruction *I : PostOrder) {
+    LLVM_DEBUG(dbgs() << "Lifting " << *I << " after " << *P << "\n");
+    if (I->getIterator() == std::next(P->getIterator())) {
+      P = I;
+      continue;
+    }
+    I->moveAfter(P);
+    P = I;
+    if (MemoryUseOrDef *MA = MSSA->getMemoryAccess(I)) {
+      MSSAU->moveAfter(MA, MemInsertPoint);
+      MemInsertPoint = MA;
+    }
+  }
+}
+
+bool MemCpyOptPass::performStackHoistOptzn(MemCpyInst *M,
+                                           AllocaInst *DestAlloca,
+                                           TypeSize Size, BatchAAResults &BAA) {
+  LLVM_DEBUG(dbgs() << "Stack Hoist: Attempting to optimize:\n" << *M << "\n");
+
+  // TODO: this should be fine as long as there are only LoadInst and GEP, but
+  // we would need more effort to re-write the GEP addrspaces
+  if (DestAlloca->getType() != M->getSource()->getType())
+    return false;
+
+  // Check that copy is full with static size.
+  // TODO: use coversInputFully instead here
+  const DataLayout &DL = DestAlloca->getDataLayout();
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
+  if (!DestSize || Size != *DestSize) {
+    LLVM_DEBUG(dbgs() << "Stack Hoist: Destination alloca size mismatch\n");
+    return false;
+  }
+
+  MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
+  if (!MA)
+    // Degenerate case: memcpy marked as not accessing memory.
+    return false;
+
+  // The source align must be larger than or equal to the alloca's
+  // align. If not so, we check to see if we can force the source of the memcpy
+  // to the alignment we need. If we fail, we bail out.
+  // TODO: we don't have to bail out if we only have LoadInst or other
+  // intrinsics (e.g. memcpy) and can instead reduce the alignment of the use.
+  Align MAlign = M->getSourceAlign().valueOrOne();
+  Align AllocaAlign = DestAlloca->getAlign();
+  if (MAlign < AllocaAlign &&
+      getOrEnforceKnownAlignment(M->getSource(), AllocaAlign, DL, M, AC, DT) <
+          AllocaAlign)
+    return false;
+
+  // Check that dest is a never-captured, unescaped alloca.
+  MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
+  MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+  SmallVector<Instruction *, 4> LifetimeMarkers;
+  SmallVector<std::pair<Instruction *, bool>, 4> AAMetadataInstrs;
+
+  SmallVector<std::pair<Instruction *, bool>, 8> Worklist;
+  Worklist.push_back({DestAlloca, true});
+  unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
+  Worklist.reserve(MaxUsesToExplore);
+  SmallSet<const Instruction *, 20> Visited;
+  while (!Worklist.empty()) {
+    Instruction *I;
+    bool mayMove;
+    std::tie(I, mayMove) = Worklist.pop_back_val();
+    for (const Use &U : I->uses()) {
+      auto *UI = cast<Instruction>(U.getUser());
+      // We don't care about the store itself.
+      if (UI == M)
+        continue;
+      if (UI->isLifetimeStartOrEnd()) {
+        LifetimeMarkers.push_back(UI);
+        continue;
+      }
+      if (Visited.size() >= MaxUsesToExplore) {
+        LLVM_DEBUG(
+            dbgs()
+            << "Stack Hoist: Exceeded max uses to see ModRef, bailing\n");
+        return false;
+      }
+      if (!Visited.insert(UI).second)
+        continue;
+      UseCaptureInfo CI(CaptureComponents::None, CaptureComponents::None);
+      for (const Use &U : UI->operands()) {
+        auto UCI = DetermineUseCaptureKind(U, DestAlloca);
+        CI.UseCC |= UCI.UseCC;
+        CI.ResultCC |= UCI.ResultCC;
+      }
+      if (capturesAnything(CI.UseCC))
+        return false;
+      if (!DT->dominates(UI, M)) {
+        if (UI->mayWriteToMemory()) {
+          ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
+          if (isModSet(Res))
+            return false;
+          mayMove = false;
+        }
+        if (UI->mayReadFromMemory()) { // TODO: (U dom P) is sufficient, but is
+                                       // it the necessary condition?
+          ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
+          if (isRefSet(Res)) {
+            if (!DT->dominates(M, UI))
+              // TODO: does writtenBetween care about dom, or just moveUp?
+              return false;
+            // If UI can modify SrcLoc, then we cannot alias it to DestLoc
+            ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc);
+            if (isModSet(Res))
+              return false;
+            bool moveUp = writtenBetween(MSSA, BAA, SrcLoc, MA,
+                                         MSSA->getMemoryAccess(UI));
+            if (moveUp) {
+              // It is safe to move this read up as long as the memory it reads
+              // doesn't change between UI and M (such as only reading DestLoc
+              // with a LoadInst, or argmemonly), and UI post-dominates M (so
+              // we are guaranteed to eventually execute UI)
+              if (!mayMove)
+                return false;
+              if (!isa<LoadInst>(UI))
+                return false;
+              if (!PDT->dominates(UI, M))
+                return false;
+            }
+            AAMetadataInstrs.push_back({UI, moveUp});
+          }
+        }
+      }
+      if (capturesAnything(CI.ResultCC)) {
+        if (mayMove)
+          // If this instruction may capture something other than UI (such as a
+          // SelectInst) or have other side-effects (such as a call), then we
+          // should not try to move this instruction, or instructions that use
+          // this. (canMoveUp will later check this for the other operands too)
+          mayMove = isa<GEPOperator>(UI);
+        Worklist.push_back({UI, mayMove});
+      }
+    }
+  }
+
+  // Nothing useful to do (the alloca is dead anyway and later passes will
+  // likely remove it). Exit early now to minimize spurious test changes.
+  // TODO: delete in a later PR
+  if (AAMetadataInstrs.empty())
+    return false;
+
+  // Check that all instructions we need to move will be able to move.
+  // Otherwise it may not be worthwhile to move any of them.
+  for (auto &I : AAMetadataInstrs) {
+    if (I.second && !canMoveUp(I.first, M))
+      return false;
+  }
+
+  // Remove all lifetime markers before RAUW since they no longer apply.
+  for (Instruction *I : LifetimeMarkers)
+    eraseInstruction(I);
+
+  // As this transformation can cause memory accesses that didn't previously
+  // alias to begin to alias one another, we remove !alias.scope, !noalias,
+  // !tbaa and !tbaa_struct metadata from any uses of either alloca.
+  // This is conservative, but more precision (replacing with the source tags
+  // from M) doesn't seem worthwhile right now.
+  for (auto &U : AAMetadataInstrs) {
+    Instruction *I = U.first;
+    I->setMetadata(LLVMContext::MD_alias_scope, nullptr);
+    I->setMetadata(LLVMContext::MD_noalias, nullptr);
+    I->setMetadata(LLVMContext::MD_tbaa, nullptr);
+    I->setMetadata(LLVMContext::MD_tbaa_struct, nullptr);
+    if (U.second)
+      moveUp(I, M);
+  }
+  DestAlloca->replaceAllUsesWith(M->getSource());
+  eraseInstruction(DestAlloca);
+
+  return true;
+}
+
 static bool isZeroSize(Value *Size) {
   if (auto *I = dyn_cast<Instruction>(Size))
     if (auto *Res = simplifyInstruction(I, I->getDataLayout()))
@@ -1842,25 +2082,39 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
     }
   }
 
-  // If the transfer is from a stack slot to a stack slot, then we may be able
-  // to perform the stack-move optimization. See the comments in
-  // performStackMoveOptzn() for more details.
   auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
-  if (!DestAlloca)
-    return false;
   auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
-  if (!SrcAlloca)
+  // Remaining transforms are mainly only interesting if we can eliminate some
+  // stack data
+  if (!DestAlloca && !SrcAlloca)
     return false;
+
   ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
-  if (Len == nullptr)
-    return false;
-  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
-                            TypeSize::getFixed(Len->getZExtValue()), BAA)) {
-    // Avoid invalidating the iterator.
-    BBI = M->getNextNonDebugInstruction()->getIterator();
-    eraseInstruction(M);
-    ++NumMemCpyInstr;
-    return true;
+  // If the transfer is from a stack slot to a stack slot, then we may be able
+  // to perform the stack-move optimization. See the comments in
+  // performStackMoveOptzn() for more details.
+  if (DestAlloca && SrcAlloca && Len) {
+    if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
+                              TypeSize::getFixed(Len->getZExtValue()), BAA)) {
+      // Avoid invalidating the iterator.
+      BBI = M->getNextNonDebugInstruction()->getIterator();
+      eraseInstruction(M);
+      ++NumMemCpyInstr;
+      return true;
+    }
+  }
+
+  if (DestAlloca && Len) {
+    // If the transfer is to a stack slot, see if we can hoist all the uses of
+    // the stack slot to here instead
+    if (performStackHoistOptzn(M, DestAlloca,
+                               TypeSize::getFixed(Len->getZExtValue()), BAA)) {
+      // Avoid invalidating the iterator.
+      BBI = M->getNextNonDebugInstruction()->getIterator();
+      eraseInstruction(M);
+      ++NumMemCpyInstr;
+      return true;
+    }
   }
 
   return false;
llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll

Lines changed: 19 additions & 5 deletions
@@ -1,11 +1,21 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
 
 ; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
 define i8 @test(i8 %input) {
+; CHECK-LABEL: define i8 @test(
+; CHECK-SAME: i8 [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[SRC]]), !noalias [[META0:![0-9]+]]
+; CHECK-NEXT:    store i8 [[INPUT]], ptr [[SRC]], align 1
+; CHECK-NEXT:    [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[SRC]]), !noalias [[META0]]
+; CHECK-NEXT:    ret i8 [[RET_VALUE]]
+;
   %tmp = alloca i8
   %dst = alloca i8
   %src = alloca i8
-  ; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
   call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0

@@ -16,9 +26,6 @@ define i8 @test(i8 %input) {
 }
 
 ; Merged scope contains "callee0: %a" and "callee0 : %b"
-; CHECK-DAG: ![[CALLEE0_A:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %a"}
-; CHECK-DAG: ![[CALLEE0_B:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %b"}
-; CHECK-DAG: ![[SCOPE]] = !{![[CALLEE0_A]], ![[CALLEE0_B]]}
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)

@@ -35,3 +42,10 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
 
 !7 = distinct !{!7, !8, !"callee2: %a"}
 !8 = distinct !{!8, !"callee2"}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"callee0: %b"}
+; CHECK: [[META2]] = distinct !{[[META2]], !"callee0"}
+; CHECK: [[META3]] = distinct !{[[META3]], [[META4:![0-9]+]], !"callee1: %a"}
+; CHECK: [[META4]] = distinct !{[[META4]], !"callee1"}
+;.
