Skip to content

Commit a2f0414

Browse files
committed
[memcpyopt] fix incorrect handling of lifetime markers
Having lifetime markers should only increase the information available to LLVM, but it was stopping the MemSSA walk and unable to see that these were extra. Secondly, it would ignore a lifetime marker that wasn't full size (relying on the callback to forbid the optimization entirely), but again sub-optimal lifetime markers are not supposed to be forbidding optimizations that would otherwise apply if they were either absent or optimal. This pass wasn't tracking GEP offsets either, so it wasn't quite correctly handled either (although earlier sub-optimal checks that this size is the same as the alloca test made this safe in the past). Lastly, the stack-move test seemed to have been providing lifetime in bits instead of bytes, although now we optimize it regardless of that, so we didn't really have to fix that.
1 parent b42aef5 commit a2f0414

File tree

5 files changed

+90
-75
lines changed

5 files changed

+90
-75
lines changed

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 63 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,56 +1366,68 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
13661366

13671367
/// Determine whether the pointer V had only undefined content (due to Def) up
13681368
/// to the given Size, either because it was freshly alloca'd or started its
1369-
/// lifetime.
1370-
static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
1371-
MemoryDef *Def, Value *Size) {
1372-
if (MSSA->isLiveOnEntryDef(Def))
1373-
return isa<AllocaInst>(getUnderlyingObject(V));
1374-
1375-
if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
1376-
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
1377-
auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
1378-
1379-
if (auto *CSize = dyn_cast<ConstantInt>(Size)) {
1380-
if (AA.isMustAlias(V, II->getArgOperand(1)) &&
1381-
LTSize->getZExtValue() >= CSize->getZExtValue())
1382-
return true;
1383-
}
1369+
/// lifetime by walking the MSSA graph.
1370+
static bool hadUndefContentsBefore(MemorySSA *MSSA, BatchAAResults &BAA,
1371+
Value *V, MemoryAccess *Clobber,
1372+
MemoryLocation Loc, Value *Size) {
1373+
while (1) {
1374+
Clobber = MSSA->getWalker()->getClobberingMemoryAccess(Clobber, Loc, BAA);
1375+
MemoryDef *Def = dyn_cast<MemoryDef>(Clobber);
1376+
if (!Def)
1377+
return false;
1378+
1379+
if (MSSA->isLiveOnEntryDef(Def))
1380+
return isa<AllocaInst>(getUnderlyingObject(V));
1381+
1382+
if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
1383+
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
1384+
auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
13841385

1385-
// If the lifetime.start covers a whole alloca (as it almost always
1386-
// does) and we're querying a pointer based on that alloca, then we know
1387-
// the memory is definitely undef, regardless of how exactly we alias.
1388-
// The size also doesn't matter, as an out-of-bounds access would be UB.
1389-
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
1390-
if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
1391-
const DataLayout &DL = Alloca->getDataLayout();
1392-
if (std::optional<TypeSize> AllocaSize =
1393-
Alloca->getAllocationSize(DL))
1394-
if (*AllocaSize == LTSize->getValue())
1386+
if (Size)
1387+
if (auto CSize = dyn_cast<ConstantInt>(Size))
1388+
if (BAA.isMustAlias(V, II->getArgOperand(1)) &&
1389+
LTSize->getZExtValue() >= CSize->getZExtValue())
13951390
return true;
1391+
1392+
// If the lifetime.start covers a whole alloca (as it almost always
1393+
// does) and we're querying a pointer based on that alloca, then we know
1394+
// the memory is definitely undef, regardless of how exactly we alias.
1395+
// The size also doesn't matter, as an out-of-bounds access would be UB.
1396+
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
1397+
if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
1398+
const DataLayout &DL = Alloca->getDataLayout();
1399+
if (std::optional<TypeSize> AllocaSize =
1400+
Alloca->getAllocationSize(DL))
1401+
if (*AllocaSize == LTSize->getValue())
1402+
return true;
1403+
}
13961404
}
1405+
Clobber = Def->getDefiningAccess();
1406+
continue;
1407+
} else if (II->getIntrinsicID() == Intrinsic::lifetime_end) {
1408+
Clobber = Def->getDefiningAccess();
1409+
continue;
13971410
}
13981411
}
1399-
}
14001412

1401-
return false;
1413+
return false;
1414+
}
14021415
}
14031416

14041417
// If the memcpy is larger than the previous, but the memory was undef prior to
14051418
// that, we can just ignore the tail. Technically we're only interested in the
14061419
// bytes from 0..MemSrcOffset and MemSrcLength+MemSrcOffset..CopySize here, but
1407-
// as we can't easily represent this location (hasUndefContents uses mustAlias
1408-
// which cannot deal with offsets), we use the full 0..CopySize range.
1420+
// as we can't easily represent this location (hadUndefContentsBefore uses
1421+
// mustAlias which cannot deal with offsets), we use the full 0..CopySize range.
14091422
static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
14101423
MemIntrinsic *MemSrc, BatchAAResults &BAA) {
14111424
Value *CopySize = MemCpy->getLength();
1412-
MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
1413-
MemoryUseOrDef *MemSrcAccess = MSSA->getMemoryAccess(MemSrc);
1414-
MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
1415-
MemSrcAccess->getDefiningAccess(), MemCpyLoc, BAA);
1416-
if (auto *MD = dyn_cast<MemoryDef>(Clobber))
1417-
if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
1418-
return true;
1425+
MemoryLocation LoadLoc = MemoryLocation::getForSource(MemCpy);
1426+
MemoryAccess *MemSrcAccess =
1427+
MSSA->getMemoryAccess(MemSrc)->getDefiningAccess();
1428+
if (hadUndefContentsBefore(MSSA, BAA, MemCpy->getSource(), MemSrcAccess,
1429+
LoadLoc, CopySize))
1430+
return true;
14191431
return false;
14201432
}
14211433

@@ -1573,11 +1585,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
15731585
// since both llvm.lifetime.start and llvm.lifetime.end intrinsics
15741586
// practically fill all the bytes of the alloca with an undefined
15751587
// value, although conceptually marked as alive/dead.
1576-
int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
1577-
if (Size < 0 || Size == DestSize) {
1578-
LifetimeMarkers.push_back(UI);
1579-
continue;
1580-
}
1588+
// We don't currently track GEP offsets and sizes, so we don't have
1589+
// a way to check whether this lifetime marker affects the relevant
1590+
// memory regions.
1591+
// While we only really need to delete lifetime.end from Src and
1592+
// lifetime.begin from Dst, those are often implied by the memcpy
1593+
// anyways so hopefully not much is lost by removing all of them.
1594+
LifetimeMarkers.push_back(UI);
1595+
continue;
15811596
}
15821597
AAMetadataInstrs.insert(UI);
15831598

@@ -1594,9 +1609,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
15941609
return true;
15951610
};
15961611

1597-
// Check that dest has no Mod/Ref, from the alloca to the Store, except full
1598-
// size lifetime intrinsics. And collect modref inst for the reachability
1599-
// check.
1612+
// Check that dest has no Mod/Ref, from the alloca to the Store. And collect
1613+
// modref inst for the reachability check.
16001614
ModRefInfo DestModRef = ModRefInfo::NoModRef;
16011615
MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
16021616
SmallVector<BasicBlock *, 8> ReachabilityWorklist;
@@ -1779,8 +1793,9 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
17791793
if (processMemSetMemCpyDependence(M, MDep, BAA))
17801794
return true;
17811795

1782-
MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
1783-
AnyClobber, MemoryLocation::getForSource(M), BAA);
1796+
MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
1797+
MemoryAccess *SrcClobber =
1798+
MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, SrcLoc, BAA);
17841799

17851800
// There are five possible optimizations we can do for memcpy:
17861801
// a) memcpy-memcpy xform which exposes redundance for DSE.
@@ -1820,7 +1835,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
18201835
}
18211836
}
18221837

1823-
if (hasUndefContents(MSSA, BAA, M->getSource(), MD, M->getLength())) {
1838+
if (hadUndefContentsBefore(MSSA, BAA, M->getSource(), AnyClobber, SrcLoc,
1839+
M->getLength())) {
18241840
LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
18251841
eraseInstruction(M);
18261842
++NumMemCpyInstr;

llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,27 @@
1-
; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
23

34
; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
45
define i8 @test(i8 %input) {
6+
; CHECK-LABEL: define i8 @test(
7+
; CHECK-SAME: i8 [[INPUT:%.*]]) {
8+
; CHECK-NEXT: [[SRC:%.*]] = alloca i8, align 1
9+
; CHECK-NEXT: store i8 [[INPUT]], ptr [[SRC]], align 1
10+
; CHECK-NEXT: [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
11+
; CHECK-NEXT: ret i8 [[RET_VALUE]]
12+
;
513
%tmp = alloca i8
614
%dst = alloca i8
715
%src = alloca i8
8-
; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
9-
call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
16+
call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !4
1017
store i8 %input, ptr %src
1118
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
12-
call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !4
19+
call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !4
1320
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !4
1421
%ret_value = load i8, ptr %dst
1522
ret i8 %ret_value
1623
}
1724

18-
; Merged scope contains "callee0: %a" and "callee0 : %b"
19-
; CHECK-DAG: ![[CALLEE0_A:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %a"}
20-
; CHECK-DAG: ![[CALLEE0_B:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %b"}
21-
; CHECK-DAG: ![[SCOPE]] = !{![[CALLEE0_A]], ![[CALLEE0_B]]}
22-
2325
declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
2426
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
2527
declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
23

34
; Make sure callslot optimization merges alias.scope metadata correctly when it merges instructions.
45
; Merging here naively generates:
56
; call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope !3
6-
; call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !0
7+
; call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !0
78
; ...
89
; !0 = !{!1}
910
; !1 = distinct !{!1, !2, !"callee1: %a"}
@@ -13,15 +14,20 @@
1314
; !5 = distinct !{!5, !"callee0"}
1415
; Which is incorrect because the lifetime.end of %src will now "noalias" the above memcpy.
1516
define i8 @test(i8 %input) {
17+
; CHECK-LABEL: define i8 @test(
18+
; CHECK-SAME: i8 [[INPUT:%.*]]) {
19+
; CHECK-NEXT: [[SRC:%.*]] = alloca i8, align 1
20+
; CHECK-NEXT: store i8 [[INPUT]], ptr [[SRC]], align 1
21+
; CHECK-NEXT: [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
22+
; CHECK-NEXT: ret i8 [[RET_VALUE]]
23+
;
1624
%tmp = alloca i8
1725
%dst = alloca i8
1826
%src = alloca i8
19-
; NOTE: we're matching the full line and looking for the lack of !alias.scope here
20-
; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false)
21-
call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !3
27+
call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !3
2228
store i8 %input, ptr %src
2329
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
24-
call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !3
30+
call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !3
2531
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !3
2632
%ret_value = load i8, ptr %dst
2733
ret i8 %ret_value

llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ define void @test_lifetime_partial_alias_3(ptr noalias %dst) {
9696
; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 1
9797
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]])
9898
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 8
99-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[GEP]], i64 4, i1 false)
10099
; CHECK-NEXT: ret void
101100
;
102101
%a = alloca [16 x i8]
@@ -112,7 +111,6 @@ define void @test_lifetime_partial_alias_4(ptr noalias %dst) {
112111
; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 1
113112
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]])
114113
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 8
115-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[GEP]], i64 8, i1 false)
116114
; CHECK-NEXT: ret void
117115
;
118116
%a = alloca [16 x i8]

llvm/test/Transforms/MemCpyOpt/stack-move.ll

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1649,20 +1649,13 @@ loop_exit:
16491649
ret void
16501650
}
16511651

1652-
; Tests that failure because partial-sized lifetimes are counted as mod.
1652+
; Tests that partial-sized lifetimes are not inhibiting the optimizer
16531653
define void @partial_lifetime() {
16541654
; CHECK-LABEL: define void @partial_lifetime() {
1655-
; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
1656-
; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
1657-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[SRC]])
1658-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr captures(none) [[DEST]])
1659-
; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
1660-
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
1661-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
1662-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 3, ptr captures(none) [[SRC]])
1655+
; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
1656+
; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
1657+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
16631658
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
1664-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[SRC]])
1665-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
16661659
; CHECK-NEXT: ret void
16671660
;
16681661
%src = alloca %struct.Foo, align 4

0 commit comments

Comments
 (0)