Skip to content

Commit 645f247

Browse files
committed
[memcpyopt] allow more memcpy-to-memcpy optimization
Allow the memcpy-to-memcpy optimization even when the sizes are not identical. For example, the copy might have been generated as a small slice of a larger struct (currently only at offset zero), or it might store to only part of an oversized alloca.
1 parent a2f0414 commit 645f247

File tree

3 files changed

+97
-53
lines changed

3 files changed

+97
-53
lines changed

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 89 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "llvm/Analysis/CFG.h"
2424
#include "llvm/Analysis/CaptureTracking.h"
2525
#include "llvm/Analysis/GlobalsModRef.h"
26+
#include "llvm/Analysis/InstSimplifyFolder.h"
2627
#include "llvm/Analysis/InstructionSimplify.h"
2728
#include "llvm/Analysis/Loads.h"
2829
#include "llvm/Analysis/MemoryLocation.h"
@@ -1431,6 +1432,28 @@ static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
14311432
return false;
14321433
}
14331434

1435+
// If only the MemSrc instruction is known, a similar but slightly weaker
1436+
// analysis can apply
1437+
static bool anyOverreadUndefContents(MemorySSA *MSSA, Instruction *Store,
1438+
BatchAAResults &BAA) {
1439+
MemoryLocation Loc;
1440+
Value *Ptr;
1441+
if (auto SI = dyn_cast<StoreInst>(Store)) {
1442+
Loc = MemoryLocation::get(SI);
1443+
Ptr = SI->getPointerOperand();
1444+
} else if (auto MI = dyn_cast<MemCpyInst>(Store)) {
1445+
Loc = MemoryLocation::getForDest(MI);
1446+
Ptr = MI->getDest();
1447+
} else {
1448+
llvm_unreachable("performStackMoveOptzn must have a known store kind");
1449+
}
1450+
MemoryAccess *MemAccess = MSSA->getMemoryAccess(Store)->getDefiningAccess();
1451+
if (hadUndefContentsBefore(MSSA, BAA, Ptr, MemAccess, Loc, nullptr))
1452+
return true;
1453+
return false;
1454+
}
1455+
1456+
14341457
/// Transform memcpy to memset when its source was just memset.
14351458
/// In other words, turn:
14361459
/// \code
@@ -1524,30 +1547,49 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
15241547
return false;
15251548
}
15261549

1527-
// Check that copy is full with static size.
1528-
const DataLayout &DL = DestAlloca->getDataLayout();
1529-
std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
1530-
if (!SrcSize || Size != *SrcSize) {
1531-
LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
1532-
return false;
1533-
}
1534-
std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
1535-
if (!DestSize || Size != *DestSize) {
1536-
LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
1537-
return false;
1538-
}
1539-
15401550
if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
15411551
return false;
15421552

1553+
Type *SrcType = SrcAlloca->getAllocatedType();
1554+
Type *DestType = DestAlloca->getAllocatedType();
1555+
// If they don't have common type, then they will need to be converted to a
1556+
// common size at runtime
1557+
const auto &DL = SrcAlloca->getDataLayout();
1558+
TypeSize SrcSize = DL.getTypeAllocSize(SrcType);
1559+
TypeSize DestSize = DL.getTypeAllocSize(DestType);
1560+
if (SrcType != DestType)
1561+
if (SrcSize != DestSize)
1562+
if (!SrcSize.isFixed() || !DestSize.isFixed())
1563+
return false;
1564+
1565+
// Check that copy is full with dest size, either because it wrote every byte,
1566+
// or it was fresh.
1567+
std::optional<TypeSize> FullSize = DestAlloca->getAllocationSize(DL);
1568+
if (!FullSize || Size != *FullSize)
1569+
if (!anyOverreadUndefContents(MSSA, Store, BAA)) {
1570+
LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
1571+
return false;
1572+
}
1573+
1574+
// Check if it will be legal to combine allocas without breaking dominator.
1575+
// TODO: Try to hoist the arguments (recursively) instead of giving up
1576+
// immediately.
1577+
bool MoveSrc = !DT->dominates(SrcAlloca, DestAlloca);
1578+
if (MoveSrc) {
1579+
if (!DT->dominates(SrcAlloca->getArraySize(), DestAlloca))
1580+
return false;
1581+
} else {
1582+
if (!DT->dominates(DestAlloca->getArraySize(), SrcAlloca))
1583+
return false;
1584+
}
1585+
15431586
// Check that src and dest are never captured, unescaped allocas. Also
15441587
// find the nearest common dominator and postdominator for all users in
15451588
// order to shrink wrap the lifetimes, and instructions with noalias metadata
15461589
// to remove them.
15471590

15481591
SmallVector<Instruction *, 4> LifetimeMarkers;
15491592
SmallSet<Instruction *, 4> AAMetadataInstrs;
1550-
bool SrcNotDom = false;
15511593

15521594
auto CaptureTrackingWithModRef =
15531595
[&](Instruction *AI,
@@ -1561,10 +1603,6 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
15611603
Instruction *I = Worklist.pop_back_val();
15621604
for (const Use &U : I->uses()) {
15631605
auto *UI = cast<Instruction>(U.getUser());
1564-
// If any use that isn't dominated by SrcAlloca exists, we move src
1565-
// alloca to the entry before the transformation.
1566-
if (!DT->dominates(SrcAlloca, UI))
1567-
SrcNotDom = true;
15681606

15691607
if (Visited.size() >= MaxUsesToExplore) {
15701608
LLVM_DEBUG(
@@ -1678,15 +1716,43 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
16781716
if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
16791717
return false;
16801718

1681-
// We can do the transformation. First, move the SrcAlloca to the start of the
1682-
// BB.
1683-
if (SrcNotDom)
1684-
SrcAlloca->moveBefore(*SrcAlloca->getParent(),
1685-
SrcAlloca->getParent()->getFirstInsertionPt());
1719+
// We can now do the transformation. First move the Src if it was after Dest.
1720+
if (MoveSrc)
1721+
SrcAlloca->moveBefore(DestAlloca->getIterator());
1722+
16861723
// Align the allocas appropriately.
16871724
SrcAlloca->setAlignment(
16881725
std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
16891726

1727+
// Size the allocas appropriately.
1728+
Value *SrcArraySize = SrcAlloca->getArraySize();
1729+
Value *DestArraySize = DestAlloca->getArraySize();
1730+
IRBuilder<InstSimplifyFolder> Builder(SrcAlloca->getContext(),
1731+
InstSimplifyFolder(DL));
1732+
Builder.SetInsertPoint(SrcAlloca);
1733+
Type *Int32Ty = Builder.getInt32Ty();
1734+
if (SrcType != DestType && SrcSize != DestSize) {
1735+
SrcAlloca->setAllocatedType(Type::getInt8Ty(Load->getContext()));
1736+
if (SrcArraySize->getType() != Int32Ty)
1737+
SrcArraySize = Builder.CreateZExtOrTrunc(SrcArraySize, Int32Ty);
1738+
if (DestArraySize->getType() != Int32Ty)
1739+
DestArraySize = Builder.CreateZExtOrTrunc(DestArraySize, Int32Ty);
1740+
SrcArraySize = Builder.CreateMul(
1741+
SrcArraySize, ConstantInt::get(Int32Ty, SrcSize.getFixedValue()), "",
1742+
true, true);
1743+
DestArraySize = Builder.CreateMul(
1744+
DestArraySize, ConstantInt::get(Int32Ty, DestSize.getFixedValue()), "",
1745+
true, true);
1746+
}
1747+
if (SrcArraySize != DestArraySize) {
1748+
if (SrcArraySize->getType() != DestArraySize->getType()) {
1749+
SrcArraySize = Builder.CreateZExtOrTrunc(SrcArraySize, Int32Ty);
1750+
DestArraySize = Builder.CreateZExtOrTrunc(DestArraySize, Int32Ty);
1751+
}
1752+
SrcAlloca->setOperand(0, Builder.CreateBinaryIntrinsic(
1753+
Intrinsic::umax, SrcArraySize, DestArraySize));
1754+
}
1755+
16901756
// Merge the two allocas.
16911757
DestAlloca->replaceAllUsesWith(SrcAlloca);
16921758
eraseInstruction(DestAlloca);

llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,9 @@ declare void @decompose(ptr nocapture)
7676
define void @test5(ptr %ptr) {
7777
; CHECK-LABEL: @test5(
7878
; CHECK-NEXT: entry:
79-
; CHECK-NEXT: [[EARLY_DATA:%.*]] = alloca [128 x i8], align 8
80-
; CHECK-NEXT: [[TMP:%.*]] = alloca [[T:%.*]], align 8
81-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[EARLY_DATA]])
79+
; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i32 8224, align 8
8280
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 8
8381
; CHECK-NEXT: call fastcc void @decompose(ptr [[TMP]])
84-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[EARLY_DATA]], ptr [[TMP]], i64 32, i1 false)
8582
; CHECK-NEXT: ret void
8683
;
8784
entry:

llvm/test/Transforms/MemCpyOpt/stack-move.ll

Lines changed: 7 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,16 +1029,10 @@ bb2:
10291029
; for the purposes of liveness analysis, not a definition.
10301030
define void @incomplete_memcpy() {
10311031
; CHECK-LABEL: define void @incomplete_memcpy() {
1032-
; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
1033-
; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
1034-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[SRC]])
1035-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[DEST]])
1036-
; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
1037-
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[SRC]])
1038-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 11, i1 false)
1032+
; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
1033+
; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
1034+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
10391035
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
1040-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[SRC]])
1041-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
10421036
; CHECK-NEXT: ret void
10431037
;
10441038
%src = alloca %struct.Foo, align 4
@@ -1058,17 +1052,10 @@ define void @incomplete_memcpy() {
10581052
; for the purposes of liveness analysis, not a definition.
10591053
define void @incomplete_store() {
10601054
; CHECK-LABEL: define void @incomplete_store() {
1061-
; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
1062-
; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
1063-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[SRC]])
1064-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[DEST]])
1065-
; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
1066-
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[SRC]])
1067-
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4
1068-
; CHECK-NEXT: store i32 [[TMP2]], ptr [[DEST]], align 4
1055+
; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
1056+
; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
1057+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
10691058
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
1070-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[SRC]])
1071-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
10721059
; CHECK-NEXT: ret void
10731060
;
10741061
%src = alloca %struct.Foo, align 4
@@ -1182,15 +1169,9 @@ define void @dynamically_sized_memcpy(i64 %size) {
11821169
define void @mismatched_alloca_size() {
11831170
; CHECK-LABEL: define void @mismatched_alloca_size() {
11841171
; CHECK-NEXT: [[SRC:%.*]] = alloca i8, i64 24, align 4
1185-
; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i64 12, align 4
1186-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr captures(none) [[SRC]])
1187-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[DEST]])
11881172
; CHECK-NEXT: store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
11891173
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
1190-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
1191-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
1192-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr captures(none) [[SRC]])
1193-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
1174+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
11941175
; CHECK-NEXT: ret void
11951176
;
11961177
%src = alloca i8, i64 24, align 4

0 commit comments

Comments
 (0)