
Commit 2f497ec

[ARM] Fix ARM backend to correctly use atomic expansion routines.
Without this patch, clang would generate calls to __sync_* routines on targets where it does not make sense; we can't assume the routines exist on unknown targets. Linux has special implementations of the routines that work on old ARM targets; other targets have no such routines. In general, atomic operations which aren't natively supported should go through libatomic (__atomic_*) APIs, which can support arbitrary atomics through locks.

ARM targets older than v6, where this patch makes a difference, are rare in practice, but not completely extinct. See, for example, discussion on D116088.

This also affects Cortex-M0, but I don't think __sync_* routines actually exist in any Cortex-M0 libraries. So in practice this just leads to a slightly different linker error for those cases, I think.

Mechanically, this patch does the following:

- Ensures we run atomic expansion unconditionally; it never makes sense to completely skip it.
- Fixes getMaxAtomicSizeInBitsSupported() so it returns an appropriate number on all ARM subtargets.
- Fixes shouldExpandAtomicRMWInIR() and shouldExpandAtomicCmpXchgInIR() to correctly handle subtargets that don't have atomic instructions.

Differential Revision: https://reviews.llvm.org/D120026
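To make the behavioral change concrete, here is a minimal sketch (the target triple and function name are illustrative, not taken from the patch): on a pre-v6, non-Linux ARM target a 64-bit seq_cst atomic load is now lowered through libatomic rather than the __sync_* ABI, matching the updated ARMV4 checks in llvm/test/CodeGen/ARM/atomic-load-store.ll.

; Illustrative IR only; the triple is an assumption for the example.
target triple = "armv4t-none-eabi"

define i64 @load64(i64* %p) {
  ; Old lowering: a call to __sync_val_compare_and_swap_8, which may not exist.
  ; New lowering: a call to the libatomic routine __atomic_load_8.
  %v = load atomic i64, i64* %p seq_cst, align 8
  ret i64 %v
}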
1 parent 3ac84c4 commit 2f497ec

8 files changed: 104 additions & 79 deletions

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 48 additions & 14 deletions
@@ -1369,6 +1369,29 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     }
   }
 
+  // Compute supported atomic widths.
+  if (Subtarget->isTargetLinux() ||
+      (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
+    // For targets where __sync_* routines are reliably available, we use them
+    // if necessary.
+    //
+    // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
+    // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
+    //
+    // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
+    // such targets should provide __sync_* routines, which use the ARM mode
+    // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
+    // encoding; see ARMISD::MEMBARRIER_MCR.)
+    setMaxAtomicSizeInBitsSupported(64);
+  } else if (Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) {
+    // Cortex-M (besides Cortex-M0) have 32-bit atomics.
+    setMaxAtomicSizeInBitsSupported(32);
+  } else {
+    // We can't assume anything about other targets; just use libatomic
+    // routines.
+    setMaxAtomicSizeInBitsSupported(0);
+  }
+
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
@@ -20978,19 +21001,25 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   if (AI->isFloatingPointOperation())
     return AtomicExpansionKind::CmpXChg;
 
-  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
-  // implement atomicrmw without spilling. If the target address is also on the
-  // stack and close enough to the spill slot, this can lead to a situation
-  // where the monitor always gets cleared and the atomic operation can never
-  // succeed. So at -O0 lower this operation to a CAS loop.
-  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
-    return AtomicExpansionKind::CmpXChg;
-
   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
-  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
-  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
-             ? AtomicExpansionKind::LLSC
-             : AtomicExpansionKind::None;
+  bool hasAtomicRMW;
+  if (Subtarget->isMClass())
+    hasAtomicRMW = Subtarget->hasV8MBaselineOps();
+  else if (Subtarget->isThumb())
+    hasAtomicRMW = Subtarget->hasV7Ops();
+  else
+    hasAtomicRMW = Subtarget->hasV6Ops();
+  if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
+    // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+    // implement atomicrmw without spilling. If the target address is also on
+    // the stack and close enough to the spill slot, this can lead to a
+    // situation where the monitor always gets cleared and the atomic operation
+    // can never succeed. So at -O0 lower this operation to a CAS loop.
+    if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+      return AtomicExpansionKind::CmpXChg;
+    return AtomicExpansionKind::LLSC;
+  }
+  return AtomicExpansionKind::None;
 }
 
 // Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
@@ -21003,8 +21032,13 @@ ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
   // situation where the monitor always gets cleared and the atomic operation
   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
   unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
-  bool HasAtomicCmpXchg =
-      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
+  bool HasAtomicCmpXchg;
+  if (Subtarget->isMClass())
+    HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
+  else if (Subtarget->isThumb())
+    HasAtomicCmpXchg = Subtarget->hasV7Ops();
+  else
+    HasAtomicCmpXchg = Subtarget->hasV6Ops();
   if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
       Size <= (Subtarget->isMClass() ? 32U : 64U))
     return AtomicExpansionKind::LLSC;
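For subtargets that end up in the setMaxAtomicSizeInBitsSupported(0) case above, AtomicExpandPass rewrites atomic operations into libatomic calls. A rough sketch of the shape of that expansion for a 64-bit atomicrmw add follows; the function names are illustrative, the helper __atomic_fetch_add_8 and the memory-order constant 5 (seq_cst) follow the libatomic ABI, and casts and attributes are simplified.

; Input IR:
define i64 @fetch_add(i64* %p, i64 %v) {
  %old = atomicrmw add i64* %p, i64 %v seq_cst
  ret i64 %old
}

; Approximate shape after expansion to a libatomic call:
declare i64 @__atomic_fetch_add_8(i8*, i64, i32)

define i64 @fetch_add_expanded(i64* %p, i64 %v) {
  %addr = bitcast i64* %p to i8*
  ; 5 corresponds to seq_cst ordering.
  %old = call i64 @__atomic_fetch_add_8(i8* %addr, i64 %v, i32 5)
  ret i64 %old
}

Operations without a dedicated libatomic entry point (min, max, umin, umax) are instead expanded to a compare-exchange loop, which is why the corresponding tests in atomic-64bit.ll below now check for __atomic_compare_exchange_8.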

llvm/lib/Target/ARM/ARMSubtarget.cpp

Lines changed: 0 additions & 2 deletions
@@ -411,8 +411,6 @@ bool ARMSubtarget::enablePostRAMachineScheduler() const {
   return !isThumb1Only();
 }
 
-bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier(); }
-
 bool ARMSubtarget::useStride4VFPs() const {
   // For general targets, the prologue can grow when VFPs are allocated with
   // stride 4 (more vpush instructions). But WatchOS uses a compact unwind

llvm/lib/Target/ARM/ARMSubtarget.h

Lines changed: 0 additions & 3 deletions
@@ -478,9 +478,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   /// scheduling, DAGCombine, etc.).
   bool useAA() const override { return true; }
 
-  // enableAtomicExpand- True if we need to expand our atomics.
-  bool enableAtomicExpand() const override;
-
   /// getInstrItins - Return the instruction itineraries based on subtarget
   /// selection.
   const InstrItineraryData *getInstrItineraryData() const override {

llvm/test/CodeGen/ARM/atomic-64bit.ll

Lines changed: 13 additions & 13 deletions
@@ -30,7 +30,7 @@ define i64 @test1(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_add_8
+; CHECK-M: __atomic_fetch_add_8
 
   %r = atomicrmw add i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -61,7 +61,7 @@ define i64 @test2(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_sub_8
+; CHECK-M: __atomic_fetch_sub_8
 
   %r = atomicrmw sub i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -92,7 +92,7 @@ define i64 @test3(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_and_8
+; CHECK-M: __atomic_fetch_and_8
 
   %r = atomicrmw and i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -123,7 +123,7 @@ define i64 @test4(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_or_8
+; CHECK-M: __atomic_fetch_or_8
 
   %r = atomicrmw or i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -154,7 +154,7 @@ define i64 @test5(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_xor_8
+; CHECK-M: __atomic_fetch_xor_8
 
   %r = atomicrmw xor i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -177,7 +177,7 @@ define i64 @test6(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_lock_test_and_set_8
+; CHECK-M: __atomic_exchange_8
 
   %r = atomicrmw xchg i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -213,7 +213,7 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
 ; CHECK-THUMB: beq
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_val_compare_and_swap_8
+; CHECK-M: __atomic_compare_exchange_8
 
   %pair = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst
   %r = extractvalue { i64, i1 } %pair, 0
@@ -237,7 +237,7 @@ define i64 @test8(i64* %ptr) {
 ; CHECK-THUMB-NOT: strexd
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_val_compare_and_swap_8
+; CHECK-M: __atomic_load_8
 
   %r = load atomic i64, i64* %ptr seq_cst, align 8
   ret i64 %r
@@ -263,7 +263,7 @@ define void @test9(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_lock_test_and_set_8
+; CHECK-M: __atomic_store_8
 
   store atomic i64 %val, i64* %ptr seq_cst, align 8
   ret void
@@ -308,7 +308,7 @@ define i64 @test10(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_min_8
+; CHECK-M: __atomic_compare_exchange_8
 
   %r = atomicrmw min i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -353,7 +353,7 @@ define i64 @test11(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_umin_8
+; CHECK-M: __atomic_compare_exchange_8
 
   %r = atomicrmw umin i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -398,7 +398,7 @@ define i64 @test12(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_max_8
+; CHECK-M: __atomic_compare_exchange_8
 
   %r = atomicrmw max i64* %ptr, i64 %val seq_cst
   ret i64 %r
@@ -443,7 +443,7 @@ define i64 @test13(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
-; CHECK-M: __sync_fetch_and_umax_8
+; CHECK-M: __atomic_compare_exchange_8
 
   %r = atomicrmw umax i64* %ptr, i64 %val seq_cst
   ret i64 %r

llvm/test/CodeGen/ARM/atomic-load-store.ll

Lines changed: 2 additions & 2 deletions
@@ -94,14 +94,14 @@ define void @test4(i8* %ptr1, i8* %ptr2) {
 
 define i64 @test_old_load_64bit(i64* %p) {
 ; ARMV4-LABEL: test_old_load_64bit
-; ARMV4: ___sync_val_compare_and_swap_8
+; ARMV4: ___atomic_load_8
   %1 = load atomic i64, i64* %p seq_cst, align 8
   ret i64 %1
 }
 
 define void @test_old_store_64bit(i64* %p, i64 %v) {
 ; ARMV4-LABEL: test_old_store_64bit
-; ARMV4: ___sync_lock_test_and_set_8
+; ARMV4: ___atomic_store_8
   store atomic i64 %v, i64* %p seq_cst, align 8
   ret void
 }
