Commit f3da020

AtomicExpand: Stop precollecting atomic instructions in function
Move the processing of an instruction into a helper function, and avoid redundantly re-checking the type of every atomic instruction: counting the assert, the same check was effectively performed three times.
1 parent 49777d7 commit f3da020
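
The shape of the change is a standard loop-body extraction: the per-instruction work moves into a helper that reports whether it rewrote anything, the non-atomic/fence filtering that used to happen while pre-collecting instructions becomes an early return, and the old MadeChange = true; continue; pairs become return true. A minimal sketch of that pattern, with a hypothetical name rather than the pass's real interface:

// Sketch only: processAtomicInstrSketch is a hypothetical stand-in for the
// new AtomicExpandImpl::processAtomicInstr shown in the diff below; it
// illustrates the "classify once, signal changes via the return value"
// pattern rather than the pass's actual expansion logic.
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

namespace sketch {
using namespace llvm;

inline bool processAtomicInstrSketch(Instruction *I) {
  auto *LI = dyn_cast<LoadInst>(I);
  auto *SI = dyn_cast<StoreInst>(I);
  auto *RMWI = dyn_cast<AtomicRMWInst>(I);
  auto *CASI = dyn_cast<AtomicCmpXchgInst>(I);

  // A single classification replaces the old pre-collection filter, the
  // per-type dyn_casts inside the loop, and the assert that re-checked the
  // same property yet again.
  if (LI) {
    if (!LI->isAtomic())
      return false; // plain load: nothing for this pass to do
  } else if (SI) {
    if (!SI->isAtomic())
      return false; // plain store: nothing for this pass to do
  } else if (!RMWI && !CASI) {
    return false; // fences and all non-atomic instructions fall out here
  }

  bool MadeChange = false;
  // ... the real helper expands or converts I here, either ORing into
  // MadeChange or returning true directly (the old `continue` paths) ...
  return MadeChange;
}
} // namespace sketch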

File tree
6 files changed: +236, -225 lines

llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 140 additions & 129 deletions
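
The other half of the change shows up in run() in the second hunk below: instead of gathering the atomic instructions into a SmallVector up front, the pass now walks each block in place, in reverse, through early-increment ranges, so expansions that split blocks or erase instructions stay safe during the walk. A self-contained sketch of that iteration idiom; dropFences is a hypothetical helper used purely to exercise the idiom, not something this pass does:

// Sketch only: demonstrates make_early_inc_range + reverse, the iteration
// idiom the new run() uses. dropFences is a hypothetical example function.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"

namespace sketch {
using namespace llvm;

inline bool dropFences(Function &F) {
  bool Changed = false;
  // make_early_inc_range advances the iterator before the body runs, so the
  // body may erase the current instruction (or split the current block);
  // walking the block in reverse keeps any split in the already-visited tail.
  for (BasicBlock &BB : make_early_inc_range(F))
    for (Instruction &I : make_early_inc_range(reverse(BB)))
      if (isa<FenceInst>(&I)) {
        I.eraseFromParent();
        Changed = true;
      }
  return Changed;
}
} // namespace sketch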
@@ -119,6 +119,8 @@ class AtomicExpandImpl {
   llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                  CreateCmpXchgInstFun CreateCmpXchg);
 
+  bool processAtomicInstr(Instruction *I);
+
 public:
   bool run(Function &F, const TargetMachine *TM);
 };
@@ -203,149 +205,158 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
          Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
 }
 
-bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
-  const auto *Subtarget = TM->getSubtargetImpl(F);
-  if (!Subtarget->enableAtomicExpand())
-    return false;
-  TLI = Subtarget->getTargetLowering();
-  DL = &F.getDataLayout();
+bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
+  auto *LI = dyn_cast<LoadInst>(I);
+  auto *SI = dyn_cast<StoreInst>(I);
+  auto *RMWI = dyn_cast<AtomicRMWInst>(I);
+  auto *CASI = dyn_cast<AtomicCmpXchgInst>(I);
 
-  SmallVector<Instruction *, 1> AtomicInsts;
+  // If the Size/Alignment is not supported, replace with a libcall.
+  if (LI) {
+    if (!LI->isAtomic())
+      return false;
 
-  // Changing control-flow while iterating through it is a bad idea, so gather a
-  // list of all atomic instructions before we start.
-  for (Instruction &I : instructions(F))
-    if (I.isAtomic() && !isa<FenceInst>(&I))
-      AtomicInsts.push_back(&I);
+    if (!atomicSizeSupported(TLI, LI)) {
+      expandAtomicLoadToLibcall(LI);
+      return true;
+    }
+  } else if (SI) {
+    if (!SI->isAtomic())
+      return false;
 
-  bool MadeChange = false;
-  for (auto *I : AtomicInsts) {
-    auto LI = dyn_cast<LoadInst>(I);
-    auto SI = dyn_cast<StoreInst>(I);
-    auto RMWI = dyn_cast<AtomicRMWInst>(I);
-    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
-    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
-
-    // If the Size/Alignment is not supported, replace with a libcall.
-    if (LI) {
-      if (!atomicSizeSupported(TLI, LI)) {
-        expandAtomicLoadToLibcall(LI);
-        MadeChange = true;
-        continue;
-      }
-    } else if (SI) {
-      if (!atomicSizeSupported(TLI, SI)) {
-        expandAtomicStoreToLibcall(SI);
-        MadeChange = true;
-        continue;
-      }
-    } else if (RMWI) {
-      if (!atomicSizeSupported(TLI, RMWI)) {
-        expandAtomicRMWToLibcall(RMWI);
-        MadeChange = true;
-        continue;
-      }
-    } else if (CASI) {
-      if (!atomicSizeSupported(TLI, CASI)) {
-        expandAtomicCASToLibcall(CASI);
-        MadeChange = true;
-        continue;
-      }
+    if (!atomicSizeSupported(TLI, SI)) {
+      expandAtomicStoreToLibcall(SI);
+      return true;
+    }
+  } else if (RMWI) {
+    if (!atomicSizeSupported(TLI, RMWI)) {
+      expandAtomicRMWToLibcall(RMWI);
+      return true;
     }
+  } else if (CASI) {
+    if (!atomicSizeSupported(TLI, CASI)) {
+      expandAtomicCASToLibcall(CASI);
+      return true;
+    }
+  } else
+    return false;
 
-    if (LI && TLI->shouldCastAtomicLoadInIR(LI) ==
-                  TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
-      I = LI = convertAtomicLoadToIntegerType(LI);
-      MadeChange = true;
-    } else if (SI &&
-               TLI->shouldCastAtomicStoreInIR(SI) ==
-                   TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
-      I = SI = convertAtomicStoreToIntegerType(SI);
+  bool MadeChange = false;
+
+  if (LI && TLI->shouldCastAtomicLoadInIR(LI) ==
+                TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+    I = LI = convertAtomicLoadToIntegerType(LI);
+    MadeChange = true;
+  } else if (SI && TLI->shouldCastAtomicStoreInIR(SI) ==
+                       TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+    I = SI = convertAtomicStoreToIntegerType(SI);
+    MadeChange = true;
+  } else if (RMWI &&
+             TLI->shouldCastAtomicRMWIInIR(RMWI) ==
+                 TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+    I = RMWI = convertAtomicXchgToIntegerType(RMWI);
+    MadeChange = true;
+  } else if (CASI) {
+    // TODO: when we're ready to make the change at the IR level, we can
+    // extend convertCmpXchgToInteger for floating point too.
+    if (CASI->getCompareOperand()->getType()->isPointerTy()) {
+      // TODO: add a TLI hook to control this so that each target can
      // convert to lowering the original type one at a time.
+      I = CASI = convertCmpXchgToIntegerType(CASI);
       MadeChange = true;
-    } else if (RMWI &&
-               TLI->shouldCastAtomicRMWIInIR(RMWI) ==
-                   TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
-      I = RMWI = convertAtomicXchgToIntegerType(RMWI);
+    }
+  }
+
+  if (TLI->shouldInsertFencesForAtomic(I)) {
+    auto FenceOrdering = AtomicOrdering::Monotonic;
+    if (LI && isAcquireOrStronger(LI->getOrdering())) {
+      FenceOrdering = LI->getOrdering();
+      LI->setOrdering(AtomicOrdering::Monotonic);
+    } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
+      FenceOrdering = SI->getOrdering();
+      SI->setOrdering(AtomicOrdering::Monotonic);
+    } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
+                        isAcquireOrStronger(RMWI->getOrdering()))) {
+      FenceOrdering = RMWI->getOrdering();
+      RMWI->setOrdering(AtomicOrdering::Monotonic);
+    } else if (CASI &&
+               TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
+                   TargetLoweringBase::AtomicExpansionKind::None &&
+               (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
+                isAcquireOrStronger(CASI->getSuccessOrdering()) ||
+                isAcquireOrStronger(CASI->getFailureOrdering()))) {
+      // If a compare and swap is lowered to LL/SC, we can do smarter fence
+      // insertion, with a stronger one on the success path than on the
+      // failure path. As a result, fence insertion is directly done by
+      // expandAtomicCmpXchg in that case.
+      FenceOrdering = CASI->getMergedOrdering();
+      CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
+      CASI->setFailureOrdering(AtomicOrdering::Monotonic);
+    }
+
+    if (FenceOrdering != AtomicOrdering::Monotonic) {
+      MadeChange |= bracketInstWithFences(I, FenceOrdering);
+    }
+  } else if (I->hasAtomicStore() &&
+             TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
+    auto FenceOrdering = AtomicOrdering::Monotonic;
+    if (SI)
+      FenceOrdering = SI->getOrdering();
+    else if (RMWI)
+      FenceOrdering = RMWI->getOrdering();
+    else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
+                         TargetLoweringBase::AtomicExpansionKind::LLSC)
+      // LLSC is handled in expandAtomicCmpXchg().
+      FenceOrdering = CASI->getSuccessOrdering();
+
+    IRBuilder Builder(I);
+    if (auto TrailingFence =
+            TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
+      TrailingFence->moveAfter(I);
      MadeChange = true;
-    } else if (CASI) {
-      // TODO: when we're ready to make the change at the IR level, we can
-      // extend convertCmpXchgToInteger for floating point too.
-      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
-        // TODO: add a TLI hook to control this so that each target can
-        // convert to lowering the original type one at a time.
-        I = CASI = convertCmpXchgToIntegerType(CASI);
-        MadeChange = true;
-      }
     }
+  }
 
-    if (TLI->shouldInsertFencesForAtomic(I)) {
-      auto FenceOrdering = AtomicOrdering::Monotonic;
-      if (LI && isAcquireOrStronger(LI->getOrdering())) {
-        FenceOrdering = LI->getOrdering();
-        LI->setOrdering(AtomicOrdering::Monotonic);
-      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
-        FenceOrdering = SI->getOrdering();
-        SI->setOrdering(AtomicOrdering::Monotonic);
-      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
-                          isAcquireOrStronger(RMWI->getOrdering()))) {
-        FenceOrdering = RMWI->getOrdering();
-        RMWI->setOrdering(AtomicOrdering::Monotonic);
-      } else if (CASI &&
-                 TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
-                     TargetLoweringBase::AtomicExpansionKind::None &&
-                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
-                  isAcquireOrStronger(CASI->getSuccessOrdering()) ||
-                  isAcquireOrStronger(CASI->getFailureOrdering()))) {
-        // If a compare and swap is lowered to LL/SC, we can do smarter fence
-        // insertion, with a stronger one on the success path than on the
-        // failure path. As a result, fence insertion is directly done by
-        // expandAtomicCmpXchg in that case.
-        FenceOrdering = CASI->getMergedOrdering();
-        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
-        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
-      }
+  if (LI)
+    MadeChange |= tryExpandAtomicLoad(LI);
+  else if (SI)
+    MadeChange |= tryExpandAtomicStore(SI);
+  else if (RMWI) {
+    // There are two different ways of expanding RMW instructions:
+    // - into a load if it is idempotent
+    // - into a Cmpxchg/LL-SC loop otherwise
+    // we try them in that order.
+
+    if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
+      MadeChange = true;
 
-      if (FenceOrdering != AtomicOrdering::Monotonic) {
-        MadeChange |= bracketInstWithFences(I, FenceOrdering);
-      }
-    } else if (I->hasAtomicStore() &&
-               TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
-      auto FenceOrdering = AtomicOrdering::Monotonic;
-      if (SI)
-        FenceOrdering = SI->getOrdering();
-      else if (RMWI)
-        FenceOrdering = RMWI->getOrdering();
-      else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
-                           TargetLoweringBase::AtomicExpansionKind::LLSC)
-        // LLSC is handled in expandAtomicCmpXchg().
-        FenceOrdering = CASI->getSuccessOrdering();
-
-      IRBuilder Builder(I);
-      if (auto TrailingFence =
-              TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
-        TrailingFence->moveAfter(I);
-        MadeChange = true;
-      }
+    } else {
+      MadeChange |= tryExpandAtomicRMW(RMWI);
     }
+  } else if (CASI)
+    MadeChange |= tryExpandAtomicCmpXchg(CASI);
 
-    if (LI)
-      MadeChange |= tryExpandAtomicLoad(LI);
-    else if (SI)
-      MadeChange |= tryExpandAtomicStore(SI);
-    else if (RMWI) {
-      // There are two different ways of expanding RMW instructions:
-      // - into a load if it is idempotent
-      // - into a Cmpxchg/LL-SC loop otherwise
-      // we try them in that order.
-
-      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
+  return MadeChange;
+}
+
+bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
+  const auto *Subtarget = TM->getSubtargetImpl(F);
+  if (!Subtarget->enableAtomicExpand())
+    return false;
+  TLI = Subtarget->getTargetLowering();
+  DL = &F.getDataLayout();
+
+  bool MadeChange = false;
+
+  for (BasicBlock &BB : make_early_inc_range(F)) {
+    for (Instruction &I : make_early_inc_range(reverse(BB))) {
+      // We do this iteration backwards because the control flow introducing
+      // transforms split the block at the end.
+      if (processAtomicInstr(&I))
         MadeChange = true;
-      } else {
-        MadeChange |= tryExpandAtomicRMW(RMWI);
-      }
-    } else if (CASI)
-      MadeChange |= tryExpandAtomicCmpXchg(CASI);
+    }
   }
+
   return MadeChange;
 }
 

llvm/test/CodeGen/NVPTX/atomics-sm70.ll

Lines changed: 8 additions & 8 deletions (only the auto-generated atomicrmw.start/atomicrmw.end block-name suffixes are permuted, consistent with the pass now visiting the atomics in each block in reverse order)
@@ -61,7 +61,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2;
 ; CHECKPTX62-NEXT: not.b32 %r3, %r27;
 ; CHECKPTX62-NEXT: ld.u32 %r54, [%r1];
-; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start
+; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45
 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2;
 ; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r28;
@@ -74,9 +74,9 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT: setp.ne.s32 %p1, %r6, %r54;
 ; CHECKPTX62-NEXT: mov.u32 %r54, %r6;
 ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1;
-; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end
+; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44
 ; CHECKPTX62-NEXT: ld.u32 %r55, [%r1];
-; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start9
+; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27
 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2;
 ; CHECKPTX62-NEXT: cvt.u16.u32 %rs6, %r33;
@@ -90,14 +90,14 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT: setp.ne.s32 %p2, %r9, %r55;
 ; CHECKPTX62-NEXT: mov.u32 %r55, %r9;
 ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3;
-; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end8
+; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26
 ; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4;
 ; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3;
 ; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24;
 ; CHECKPTX62-NEXT: shl.b32 %r40, %r26, %r11;
 ; CHECKPTX62-NEXT: not.b32 %r12, %r40;
 ; CHECKPTX62-NEXT: ld.global.u32 %r56, [%r10];
-; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start27
+; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9
 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11;
 ; CHECKPTX62-NEXT: cvt.u16.u32 %rs11, %r41;
@@ -110,14 +110,14 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT: setp.ne.s32 %p3, %r15, %r56;
 ; CHECKPTX62-NEXT: mov.u32 %r56, %r15;
 ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5;
-; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end26
+; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8
 ; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4;
 ; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3;
 ; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24;
 ; CHECKPTX62-NEXT: shl.b32 %r48, %r26, %r17;
 ; CHECKPTX62-NEXT: not.b32 %r18, %r48;
 ; CHECKPTX62-NEXT: ld.shared.u32 %r57, [%r16];
-; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start45
+; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start
 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17;
 ; CHECKPTX62-NEXT: cvt.u16.u32 %rs15, %r49;
@@ -130,7 +130,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT: setp.ne.s32 %p4, %r21, %r57;
 ; CHECKPTX62-NEXT: mov.u32 %r57, %r21;
 ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7;
-; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end44
+; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end
 ; CHECKPTX62-NEXT: ret;
 %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
 %r2 = atomicrmw fadd ptr %dp0, half 1.0 seq_cst