Skip to content

Commit 16189f1

Browse files
committed
[memcpyopt] fix incorrect handling of lifetime markers
Having lifetime markers should only increase the information available to LLVM, but it was stopping the MemSSA walk and unable to see that these were extra. Secondly, it would ignore a lifetime marker that wasn't full size (relying on the callback to forbid the optimization entirely), but again sub-optimal lifetime markers are not supposed to be forbidding optimizations that would otherwise apply if they were either absent or optimal. This pass wasn't tracking GEP offsets either, so it wasn't quite correctly handled either (although earlier sub-optimal checks that this size is the same as the alloca test made this safe in the past). Lastly, the stack-move test seemed to have been providing lifetime in bits instead of bytes, although now we optimize it regardless of that, so we didn't really have to fix that.
1 parent b42aef5 commit 16189f1

File tree

5 files changed

+84
-73
lines changed

5 files changed

+84
-73
lines changed

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

Lines changed: 57 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,56 +1366,65 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
13661366

13671367
/// Determine whether the pointer V had only undefined content (due to Def) up
13681368
/// to the given Size, either because it was freshly alloca'd or started its
1369-
/// lifetime.
1370-
static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
1371-
MemoryDef *Def, Value *Size) {
1372-
if (MSSA->isLiveOnEntryDef(Def))
1373-
return isa<AllocaInst>(getUnderlyingObject(V));
1374-
1375-
if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
1376-
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
1377-
auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
1378-
1379-
if (auto *CSize = dyn_cast<ConstantInt>(Size)) {
1380-
if (AA.isMustAlias(V, II->getArgOperand(1)) &&
1381-
LTSize->getZExtValue() >= CSize->getZExtValue())
1382-
return true;
1383-
}
1369+
/// lifetime by walking the MSSA graph.
1370+
static bool hadUndefContentsBefore(MemorySSA *MSSA, BatchAAResults &BAA, Value *V,
1371+
MemoryAccess *Clobber, MemoryLocation Loc, Value *Size) {
1372+
while (1) {
1373+
Clobber = MSSA->getWalker()->getClobberingMemoryAccess(Clobber, Loc, BAA);
1374+
MemoryDef *Def = dyn_cast<MemoryDef>(Clobber);
1375+
if (!Def)
1376+
return false;
1377+
1378+
if (MSSA->isLiveOnEntryDef(Def))
1379+
return isa<AllocaInst>(getUnderlyingObject(V));
1380+
1381+
if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
1382+
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
1383+
auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
13841384

1385-
// If the lifetime.start covers a whole alloca (as it almost always
1386-
// does) and we're querying a pointer based on that alloca, then we know
1387-
// the memory is definitely undef, regardless of how exactly we alias.
1388-
// The size also doesn't matter, as an out-of-bounds access would be UB.
1389-
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
1390-
if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
1391-
const DataLayout &DL = Alloca->getDataLayout();
1392-
if (std::optional<TypeSize> AllocaSize =
1393-
Alloca->getAllocationSize(DL))
1394-
if (*AllocaSize == LTSize->getValue())
1385+
if (Size)
1386+
if (auto CSize = dyn_cast<ConstantInt>(Size))
1387+
if (BAA.isMustAlias(V, II->getArgOperand(1)) &&
1388+
LTSize->getZExtValue() >= CSize->getZExtValue())
13951389
return true;
1390+
1391+
// If the lifetime.start covers a whole alloca (as it almost always
1392+
// does) and we're querying a pointer based on that alloca, then we know
1393+
// the memory is definitely undef, regardless of how exactly we alias.
1394+
// The size also doesn't matter, as an out-of-bounds access would be UB.
1395+
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
1396+
if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
1397+
const DataLayout &DL = Alloca->getDataLayout();
1398+
if (std::optional<TypeSize> AllocaSize =
1399+
Alloca->getAllocationSize(DL))
1400+
if (*AllocaSize == LTSize->getValue())
1401+
return true;
1402+
}
13961403
}
1404+
Clobber = Def->getDefiningAccess();
1405+
continue;
1406+
} else if (II->getIntrinsicID() == Intrinsic::lifetime_end) {
1407+
Clobber = Def->getDefiningAccess();
1408+
continue;
13971409
}
13981410
}
1399-
}
14001411

1401-
return false;
1412+
return false;
1413+
}
14021414
}
14031415

14041416
// If the memcpy is larger than the previous, but the memory was undef prior to
14051417
// that, we can just ignore the tail. Technically we're only interested in the
14061418
// bytes from 0..MemSrcOffset and MemSrcLength+MemSrcOffset..CopySize here, but
1407-
// as we can't easily represent this location (hasUndefContents uses mustAlias
1419+
// as we can't easily represent this location (hadUndefContentsBefore uses mustAlias
14081420
// which cannot deal with offsets), we use the full 0..CopySize range.
14091421
static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
14101422
MemIntrinsic *MemSrc, BatchAAResults &BAA) {
14111423
Value *CopySize = MemCpy->getLength();
1412-
MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
1413-
MemoryUseOrDef *MemSrcAccess = MSSA->getMemoryAccess(MemSrc);
1414-
MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
1415-
MemSrcAccess->getDefiningAccess(), MemCpyLoc, BAA);
1416-
if (auto *MD = dyn_cast<MemoryDef>(Clobber))
1417-
if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
1418-
return true;
1424+
MemoryLocation LoadLoc = MemoryLocation::getForSource(MemCpy);
1425+
MemoryAccess *MemSrcAccess = MSSA->getMemoryAccess(MemSrc)->getDefiningAccess();
1426+
if (hadUndefContentsBefore(MSSA, BAA, MemCpy->getSource(), MemSrcAccess, LoadLoc, CopySize))
1427+
return true;
14191428
return false;
14201429
}
14211430

@@ -1573,11 +1582,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
15731582
// since both llvm.lifetime.start and llvm.lifetime.end intrinsics
15741583
// practically fill all the bytes of the alloca with an undefined
15751584
// value, although conceptually marked as alive/dead.
1576-
int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
1577-
if (Size < 0 || Size == DestSize) {
1578-
LifetimeMarkers.push_back(UI);
1579-
continue;
1580-
}
1585+
// We don't currently track GEP offsets and sizes, so we don't have
1586+
// a way to check whether this lifetime marker affects the relevant
1587+
// memory regions.
1588+
// While we only really need to delete lifetime.end from Src and
1589+
// lifetime.begin from Dst, those are often implied by the memcpy
1590+
// anyways so hopefully not much is lost by removing all of them.
1591+
LifetimeMarkers.push_back(UI);
1592+
continue;
15811593
}
15821594
AAMetadataInstrs.insert(UI);
15831595

@@ -1594,9 +1606,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
15941606
return true;
15951607
};
15961608

1597-
// Check that dest has no Mod/Ref, from the alloca to the Store, except full
1598-
// size lifetime intrinsics. And collect modref inst for the reachability
1599-
// check.
1609+
// Check that dest has no Mod/Ref, from the alloca to the Store. And collect
1610+
// modref inst for the reachability check.
16001611
ModRefInfo DestModRef = ModRefInfo::NoModRef;
16011612
MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
16021613
SmallVector<BasicBlock *, 8> ReachabilityWorklist;
@@ -1779,8 +1790,9 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
17791790
if (processMemSetMemCpyDependence(M, MDep, BAA))
17801791
return true;
17811792

1793+
MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
17821794
MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
1783-
AnyClobber, MemoryLocation::getForSource(M), BAA);
1795+
AnyClobber, SrcLoc, BAA);
17841796

17851797
// There are five possible optimizations we can do for memcpy:
17861798
// a) memcpy-memcpy xform which exposes redundance for DSE.
@@ -1820,7 +1832,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
18201832
}
18211833
}
18221834

1823-
if (hasUndefContents(MSSA, BAA, M->getSource(), MD, M->getLength())) {
1835+
if (hadUndefContentsBefore(MSSA, BAA, M->getSource(), AnyClobber, SrcLoc, M->getLength())) {
18241836
LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
18251837
eraseInstruction(M);
18261838
++NumMemCpyInstr;

llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,27 @@
1-
; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
23

34
; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
45
define i8 @test(i8 %input) {
6+
; CHECK-LABEL: define i8 @test(
7+
; CHECK-SAME: i8 [[INPUT:%.*]]) {
8+
; CHECK-NEXT: [[SRC:%.*]] = alloca i8, align 1
9+
; CHECK-NEXT: store i8 [[INPUT]], ptr [[SRC]], align 1
10+
; CHECK-NEXT: [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
11+
; CHECK-NEXT: ret i8 [[RET_VALUE]]
12+
;
513
%tmp = alloca i8
614
%dst = alloca i8
715
%src = alloca i8
8-
; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
9-
call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
16+
call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !4
1017
store i8 %input, ptr %src
1118
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
12-
call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !4
19+
call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !4
1320
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !4
1421
%ret_value = load i8, ptr %dst
1522
ret i8 %ret_value
1623
}
1724

18-
; Merged scope contains "callee0: %a" and "callee0 : %b"
19-
; CHECK-DAG: ![[CALLEE0_A:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %a"}
20-
; CHECK-DAG: ![[CALLEE0_B:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %b"}
21-
; CHECK-DAG: ![[SCOPE]] = !{![[CALLEE0_A]], ![[CALLEE0_B]]}
22-
2325
declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
2426
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
2527
declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
23

34
; Make sure callslot optimization merges alias.scope metadata correctly when it merges instructions.
45
; Merging here naively generates:
56
; call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope !3
6-
; call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !0
7+
; call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !0
78
; ...
89
; !0 = !{!1}
910
; !1 = distinct !{!1, !2, !"callee1: %a"}
@@ -13,15 +14,20 @@
1314
; !5 = distinct !{!5, !"callee0"}
1415
; Which is incorrect because the lifetime.end of %src will now "noalias" the above memcpy.
1516
define i8 @test(i8 %input) {
17+
; CHECK-LABEL: define i8 @test(
18+
; CHECK-SAME: i8 [[INPUT:%.*]]) {
19+
; CHECK-NEXT: [[SRC:%.*]] = alloca i8, align 1
20+
; CHECK-NEXT: store i8 [[INPUT]], ptr [[SRC]], align 1
21+
; CHECK-NEXT: [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
22+
; CHECK-NEXT: ret i8 [[RET_VALUE]]
23+
;
1624
%tmp = alloca i8
1725
%dst = alloca i8
1826
%src = alloca i8
19-
; NOTE: we're matching the full line and looking for the lack of !alias.scope here
20-
; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false)
21-
call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !3
27+
call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !3
2228
store i8 %input, ptr %src
2329
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
24-
call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !3
30+
call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !3
2531
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !3
2632
%ret_value = load i8, ptr %dst
2733
ret i8 %ret_value

llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ define void @test_lifetime_partial_alias_3(ptr noalias %dst) {
9696
; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 1
9797
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]])
9898
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 8
99-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[GEP]], i64 4, i1 false)
10099
; CHECK-NEXT: ret void
101100
;
102101
%a = alloca [16 x i8]
@@ -112,7 +111,6 @@ define void @test_lifetime_partial_alias_4(ptr noalias %dst) {
112111
; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 1
113112
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]])
114113
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 8
115-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[GEP]], i64 8, i1 false)
116114
; CHECK-NEXT: ret void
117115
;
118116
%a = alloca [16 x i8]

llvm/test/Transforms/MemCpyOpt/stack-move.ll

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1649,20 +1649,13 @@ loop_exit:
16491649
ret void
16501650
}
16511651

1652-
; Tests that failure because partial-sized lifetimes are counted as mod.
1652+
; Tests that partial-sized lifetimes are not inhibiting the optimizer
16531653
define void @partial_lifetime() {
16541654
; CHECK-LABEL: define void @partial_lifetime() {
1655-
; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
1656-
; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
1657-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[SRC]])
1658-
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr captures(none) [[DEST]])
1659-
; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
1660-
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
1661-
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
1662-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 3, ptr captures(none) [[SRC]])
1655+
; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
1656+
; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
1657+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
16631658
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
1664-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[SRC]])
1665-
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
16661659
; CHECK-NEXT: ret void
16671660
;
16681661
%src = alloca %struct.Foo, align 4

0 commit comments

Comments
 (0)