Commit 96ab74b

[InstCombine] remove undef loads, such as memcpy from undef (#143958)
Extend `isAllocSiteRemovable` to check whether the ModRef info indicates the allocation site is only Ref or only Mod, and to remove it accordingly. A surprising number of benchmarks contain this pattern and were not being optimized previously (due to MemorySSA walk limits). Somewhat more existing tests than I'd like had to be modified because they used exactly this pattern (and thus relied on undef memory). Claude Code contributed the new tests (and found an important typo that I'd made). This implements the discussion in #143782 (comment).
1 parent f242360 · commit 96ab74b

14 files changed: +329 / -111 lines
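As a rough sketch of the pattern described in the commit message (illustrative IR only; the function and value names below are made up, not taken from this commit's tests): an alloca that is never written before it is read holds undef, so a memcpy that only reads from it (Ref-only access) can now be deleted along with the alloca itself.

define void @copy_from_uninit(ptr %dst) {
  ; %buf is never initialized, so its contents are undef.
  %buf = alloca [16 x i8], align 1
  ; The alloca is only ever read (Ref), never written, so InstCombine can now
  ; erase both the memcpy and the alloca instead of keeping the undef copy.
  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %buf, i64 16, i1 false)
  ret void
}

declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)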

clang/test/Misc/loop-opt-setup.c

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ int foo(void) {
 // CHECK-NOT: br i1
 
 void Helper(void) {
-  const int *nodes[5];
+  const int *nodes[5] = {0};
   int num_active = 5;
 
   while (num_active)

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

Lines changed: 87 additions & 22 deletions
@@ -3277,12 +3277,13 @@ static bool isRemovableWrite(CallBase &CB, Value *UsedV,
   return Dest && Dest->Ptr == UsedV;
 }
 
-static bool isAllocSiteRemovable(Instruction *AI,
-                                 SmallVectorImpl<WeakTrackingVH> &Users,
-                                 const TargetLibraryInfo &TLI) {
+static std::optional<ModRefInfo>
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users,
+                     const TargetLibraryInfo &TLI, bool KnowInit) {
   SmallVector<Instruction*, 4> Worklist;
   const std::optional<StringRef> Family = getAllocationFamily(AI, &TLI);
   Worklist.push_back(AI);
+  ModRefInfo Access = KnowInit ? ModRefInfo::NoModRef : ModRefInfo::Mod;
 
   do {
     Instruction *PI = Worklist.pop_back_val();
@@ -3291,7 +3292,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
       switch (I->getOpcode()) {
       default:
         // Give up the moment we see something we can't handle.
-        return false;
+        return std::nullopt;
 
       case Instruction::AddrSpaceCast:
       case Instruction::BitCast:
@@ -3306,10 +3307,10 @@ static bool isAllocSiteRemovable(Instruction *AI,
         // We also fold comparisons in some conditions provided the alloc has
         // not escaped (see isNeverEqualToUnescapedAlloc).
         if (!ICI->isEquality())
-          return false;
+          return std::nullopt;
         unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
         if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
-          return false;
+          return std::nullopt;
 
         // Do not fold compares to aligned_alloc calls, as they may have to
         // return null in case the required alignment cannot be satisfied,
@@ -3329,7 +3330,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
         if (CB && TLI.getLibFunc(*CB->getCalledFunction(), TheLibFunc) &&
             TLI.has(TheLibFunc) && TheLibFunc == LibFunc_aligned_alloc &&
             !AlignmentAndSizeKnownValid(CB))
-          return false;
+          return std::nullopt;
         Users.emplace_back(I);
         continue;
       }
@@ -3339,14 +3340,21 @@ static bool isAllocSiteRemovable(Instruction *AI,
         if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
           switch (II->getIntrinsicID()) {
           default:
-            return false;
+            return std::nullopt;
 
           case Intrinsic::memmove:
           case Intrinsic::memcpy:
           case Intrinsic::memset: {
             MemIntrinsic *MI = cast<MemIntrinsic>(II);
-            if (MI->isVolatile() || MI->getRawDest() != PI)
-              return false;
+            if (MI->isVolatile())
+              return std::nullopt;
+            // Note: this could also be ModRef, but we can still interpret that
+            // as just Mod in that case.
+            ModRefInfo NewAccess =
+                MI->getRawDest() == PI ? ModRefInfo::Mod : ModRefInfo::Ref;
+            if ((Access & ~NewAccess) != ModRefInfo::NoModRef)
+              return std::nullopt;
+            Access |= NewAccess;
             [[fallthrough]];
           }
           case Intrinsic::assume:
@@ -3365,11 +3373,6 @@ static bool isAllocSiteRemovable(Instruction *AI,
           }
         }
 
-        if (isRemovableWrite(*cast<CallBase>(I), PI, TLI)) {
-          Users.emplace_back(I);
-          continue;
-        }
-
         if (Family && getFreedOperand(cast<CallBase>(I), &TLI) == PI &&
             getAllocationFamily(I, &TLI) == Family) {
           Users.emplace_back(I);
@@ -3383,20 +3386,43 @@ static bool isAllocSiteRemovable(Instruction *AI,
           continue;
         }
 
-        return false;
+        if (!isRefSet(Access) &&
+            isRemovableWrite(*cast<CallBase>(I), PI, TLI)) {
+          Access |= ModRefInfo::Mod;
+          Users.emplace_back(I);
+          continue;
+        }
+
+        return std::nullopt;
 
       case Instruction::Store: {
         StoreInst *SI = cast<StoreInst>(I);
         if (SI->isVolatile() || SI->getPointerOperand() != PI)
-          return false;
+          return std::nullopt;
+        if (isRefSet(Access))
+          return std::nullopt;
+        Access |= ModRefInfo::Mod;
+        Users.emplace_back(I);
+        continue;
+      }
+
+      case Instruction::Load: {
+        LoadInst *LI = cast<LoadInst>(I);
+        if (LI->isVolatile() || LI->getPointerOperand() != PI)
+          return std::nullopt;
+        if (isModSet(Access))
+          return std::nullopt;
+        Access |= ModRefInfo::Ref;
         Users.emplace_back(I);
         continue;
      }
      }
      llvm_unreachable("missing a return?");
    }
  } while (!Worklist.empty());
-  return true;
+
+  assert(Access != ModRefInfo::ModRef);
+  return Access;
 }
 
 Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
@@ -3424,10 +3450,31 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
     DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
   }
 
-  if (isAllocSiteRemovable(&MI, Users, TLI)) {
+  // Determine what getInitialValueOfAllocation would return without actually
+  // allocating the result.
+  bool KnowInitUndef = false;
+  bool KnowInitZero = false;
+  Constant *Init =
+      getInitialValueOfAllocation(&MI, &TLI, Type::getInt8Ty(MI.getContext()));
+  if (Init) {
+    if (isa<UndefValue>(Init))
+      KnowInitUndef = true;
+    else if (Init->isNullValue())
+      KnowInitZero = true;
+  }
+  // The various sanitizers don't actually return undef memory, but rather
+  // memory initialized with special forms of runtime poison
+  auto &F = *MI.getFunction();
+  if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
+      F.hasFnAttribute(Attribute::SanitizeAddress))
+    KnowInitUndef = false;
+
+  auto Removable =
+      isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef);
+  if (Removable) {
     for (unsigned i = 0, e = Users.size(); i != e; ++i) {
-      // Lowering all @llvm.objectsize calls first because they may
-      // use a bitcast/GEP of the alloca we are removing.
+      // Lowering all @llvm.objectsize and MTI calls first because they may use
+      // a bitcast/GEP of the alloca we are removing.
       if (!Users[i])
         continue;
 
@@ -3444,6 +3491,17 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
           eraseInstFromFunction(*I);
           Users[i] = nullptr; // Skip examining in the next loop.
         }
+        if (auto *MTI = dyn_cast<MemTransferInst>(I)) {
+          if (KnowInitZero && isRefSet(*Removable)) {
+            IRBuilderBase::InsertPointGuard Guard(Builder);
+            Builder.SetInsertPoint(MTI);
+            auto *M = Builder.CreateMemSet(
+                MTI->getRawDest(),
+                ConstantInt::get(Type::getInt8Ty(MI.getContext()), 0),
+                MTI->getLength(), MTI->getDestAlign());
+            M->copyMetadata(*MTI);
+          }
+        }
       }
     }
     for (unsigned i = 0, e = Users.size(); i != e; ++i) {
@@ -3466,7 +3524,14 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
       } else {
         // Casts, GEP, or anything else: we're about to delete this instruction,
         // so it can not have any valid uses.
-        replaceInstUsesWith(*I, PoisonValue::get(I->getType()));
+        Constant *Replace;
+        if (isa<LoadInst>(I)) {
+          assert(KnowInitZero || KnowInitUndef);
+          Replace = KnowInitUndef ? UndefValue::get(I->getType())
+                                  : Constant::getNullValue(I->getType());
+        } else
+          Replace = PoisonValue::get(I->getType());
+        replaceInstUsesWith(*I, Replace);
       }
       eraseInstFromFunction(*I);
     }

llvm/test/Transforms/InstCombine/and-or-icmps.ll

Lines changed: 64 additions & 10 deletions
@@ -364,23 +364,77 @@ define <2 x i1> @and_ne_with_diff_one_splatvec(<2 x i32> %x) {
 
 define void @simplify_before_foldAndOfICmps(ptr %p) {
 ; CHECK-LABEL: @simplify_before_foldAndOfICmps(
-; CHECK-NEXT:    [[A8:%.*]] = alloca i16, align 2
-; CHECK-NEXT:    [[L7:%.*]] = load i16, ptr [[A8]], align 2
+; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store ptr null, ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %A8 = alloca i16
+  %L7 = load i16, ptr %A8
+  %G21 = getelementptr i16, ptr %A8, i8 -1
+  %B11 = udiv i16 %L7, -1
+  %G4 = getelementptr i16, ptr %A8, i16 %B11
+  %L2 = load i16, ptr %G4
+  %L = load i16, ptr %G4
+  %B23 = mul i16 %B11, %B11
+  %L4 = load i16, ptr %A8
+  %B21 = sdiv i16 %L7, %L4
+  %B7 = sub i16 0, %B21
+  %B18 = mul i16 %B23, %B7
+  %C10 = icmp ugt i16 %L, %B11
+  %B20 = and i16 %L7, %L2
+  %B1 = mul i1 %C10, true
+  %C5 = icmp sle i16 %B21, %L
+  %C11 = icmp ule i16 %B21, %L
+  %C7 = icmp slt i16 %B20, 0
+  %B29 = srem i16 %L4, %B18
+  %B15 = add i1 %C7, %C10
+  %B19 = add i1 %C11, %B15
+  %C6 = icmp sge i1 %C11, %B19
+  %B33 = or i16 %B29, %L4
+  %C13 = icmp uge i1 %C5, %B1
+  %C3 = icmp ult i1 %C13, %C6
+  store i16 undef, ptr %G21
+  %C18 = icmp ule i1 %C10, %C7
+  %G26 = getelementptr i1, ptr null, i1 %C3
+  store i16 %B33, ptr %p
+  store i1 %C18, ptr %p
+  store ptr %G26, ptr %p
+  ret void
+}
+
+define void @simplify_before_foldAndOfICmps2(ptr %p, ptr %A8) "instcombine-no-verify-fixpoint" {
+; CHECK-LABEL: @simplify_before_foldAndOfICmps2(
+; CHECK-NEXT:    [[L7:%.*]] = load i16, ptr [[A8:%.*]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i16 [[L7]], -1
 ; CHECK-NEXT:    [[B11:%.*]] = zext i1 [[TMP1]] to i16
-; CHECK-NEXT:    [[C10:%.*]] = icmp ugt i16 [[L7]], [[B11]]
-; CHECK-NEXT:    [[C7:%.*]] = icmp slt i16 [[L7]], 0
-; CHECK-NEXT:    [[C3:%.*]] = and i1 [[C7]], [[C10]]
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i1 [[C10]], true
-; CHECK-NEXT:    [[C18:%.*]] = or i1 [[C7]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i1 [[C3]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i64
+; CHECK-NEXT:    [[G4:%.*]] = getelementptr i16, ptr [[A8]], i64 [[TMP2]]
+; CHECK-NEXT:    [[L2:%.*]] = load i16, ptr [[G4]], align 2
+; CHECK-NEXT:    [[L4:%.*]] = load i16, ptr [[A8]], align 2
+; CHECK-NEXT:    [[B21:%.*]] = sdiv i16 [[L7]], [[L4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP1]], i16 [[B21]], i16 0
+; CHECK-NEXT:    [[B18:%.*]] = sub i16 0, [[TMP5]]
+; CHECK-NEXT:    [[C11:%.*]] = icmp ugt i16 [[L2]], [[B11]]
+; CHECK-NEXT:    [[B20:%.*]] = and i16 [[L7]], [[L2]]
+; CHECK-NEXT:    [[C5:%.*]] = icmp sgt i16 [[B21]], [[L2]]
+; CHECK-NEXT:    [[C12:%.*]] = icmp ule i16 [[B21]], [[L2]]
+; CHECK-NEXT:    [[C10:%.*]] = icmp slt i16 [[B20]], 0
+; CHECK-NEXT:    [[B29:%.*]] = srem i16 [[L4]], [[B18]]
+; CHECK-NEXT:    [[B15:%.*]] = xor i1 [[C10]], [[C11]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and i1 [[C12]], [[B15]]
+; CHECK-NEXT:    [[C6:%.*]] = xor i1 [[TMP6]], true
+; CHECK-NEXT:    [[B33:%.*]] = or i16 [[B29]], [[L4]]
+; CHECK-NEXT:    [[C3:%.*]] = and i1 [[C5]], [[C6]]
+; CHECK-NEXT:    [[C4:%.*]] = and i1 [[C3]], [[C11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[C11]], true
+; CHECK-NEXT:    [[C18:%.*]] = or i1 [[C10]], [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i1 [[C4]] to i64
 ; CHECK-NEXT:    [[G26:%.*]] = getelementptr i1, ptr null, i64 [[TMP3]]
-; CHECK-NEXT:    store i16 [[L7]], ptr [[P:%.*]], align 2
+; CHECK-NEXT:    store i16 [[B33]], ptr [[P:%.*]], align 2
 ; CHECK-NEXT:    store i1 [[C18]], ptr [[P]], align 1
 ; CHECK-NEXT:    store ptr [[G26]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
-  %A8 = alloca i16
   %L7 = load i16, ptr %A8
   %G21 = getelementptr i16, ptr %A8, i8 -1
   %B11 = udiv i16 %L7, -1

llvm/test/Transforms/InstCombine/apint-shift.ll

Lines changed: 1 addition & 2 deletions
@@ -562,11 +562,10 @@ define i40 @test26(i40 %A) {
 
 ; OSS-Fuzz #9880
 ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9880
-define i177 @ossfuzz_9880(i177 %X) {
+define i177 @ossfuzz_9880(i177 %X, ptr %A) {
 ; CHECK-LABEL: @ossfuzz_9880(
 ; CHECK-NEXT:    ret i177 0
 ;
-  %A = alloca i177
   %L1 = load i177, ptr %A
   %B = or i177 0, -1
   %B5 = udiv i177 %L1, %B

llvm/test/Transforms/InstCombine/call-cast-target.ll

Lines changed: 6 additions & 8 deletions
@@ -110,19 +110,17 @@ entry:
 
 declare i1 @fn5(ptr byval({ i32, i32 }) align 4 %r)
 
-define i1 @test5() {
-; CHECK-LABEL: define i1 @test5() {
-; CHECK-NEXT:    [[TMP1:%.*]] = alloca { i32, i32 }, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 4
+define i1 @test5(ptr %ptr) {
+; CHECK-LABEL: define i1 @test5(ptr %ptr) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i32 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @fn5(i32 [[TMP2]], i32 [[TMP4]])
 ; CHECK-NEXT:    ret i1 [[TMP5]]
 ;
-  %1 = alloca { i32, i32 }, align 4
-  %2 = getelementptr inbounds { i32, i32 }, ptr %1, i32 0, i32 0
+  %2 = getelementptr inbounds { i32, i32 }, ptr %ptr, i32 0, i32 0
   %3 = load i32, ptr %2, align 4
-  %4 = getelementptr inbounds { i32, i32 }, ptr %1, i32 0, i32 1
+  %4 = getelementptr inbounds { i32, i32 }, ptr %ptr, i32 0, i32 1
   %5 = load i32, ptr %4, align 4
   %6 = call i1 @fn5(i32 %3, i32 %5)
   ret i1 %6
