
Commit bd588df

[X86] Improve lowering of idempotent RMW operations
The current lowering uses an mfence. mfences are substantially higher latency than the locked operations originally requested, but we do want to avoid contention on the original cache line. As such, use a locked instruction on a cache line assumed to be thread local.

Differential Revision: https://reviews.llvm.org/D58632

llvm-svn: 360393
1 parent 1129678 commit bd588df
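
For context, the pattern this change targets is an atomic read-modify-write whose operand makes it a no-op and whose result is unused; such operations are typically emitted purely for their fence-like ordering effect. A minimal C++ illustration (hypothetical user code, not part of this commit) of source that produces the canonical "atomicrmw or ..., 0 seq_cst" with no uses:

  #include <atomic>

  std::atomic<int> Flag{0};

  // Or-ing in 0 changes no memory, but the seq_cst ordering still has to be
  // honored.  Clang emits an atomicrmw or of 0 with an unused result here;
  // before this commit x86 lowered that to an mfence plus a reload of the
  // location, and with it the sequence becomes "lock orl $0, (%rsp)" instead.
  void fullFenceViaIdempotentRMW() {
    (void)Flag.fetch_or(0, std::memory_order_seq_cst);
  }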

2 files changed: 111 additions and 66 deletions
llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 88 additions & 1 deletion
@@ -25727,6 +25727,14 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
     return nullptr;
 
+  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+  // lowering available in lowerAtomicArith.
+  // TODO: push more cases through this path.
+  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+        AI->use_empty())
+      return nullptr;
+
   auto Builder = IRBuilder<>(AI);
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   auto SSID = AI->getSyncScopeID();
@@ -26223,6 +26231,59 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }
 
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix.  Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable.  Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget,
+                                 SDValue Chain, SDLoc DL) {
+  // Implementation notes:
+  // 1) LOCK prefix creates a full read/write reordering barrier for memory
+  // operations issued by the current processor.  As such, the location
+  // referenced is not relevant for the ordering properties of the instruction.
+  // See: Intel® 64 and IA-32 Architectures Software Developer's Manual,
+  // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+  // 2) Using an immediate operand appears to be the best encoding choice
+  // here since it doesn't require an extra register.
+  // 3) OR appears to be very slightly faster than ADD. (Though, the difference
+  // is small enough it might just be measurement noise.)
+  // 4) For the moment, we are using top of stack.  This creates false sharing
+  // with actual stack access/call sequences, and it would be better to use a
+  // location within the redzone.  For the moment, this is still better than an
+  // mfence though.  TODO: Revise the offset used when we can assume a redzone.
+  //
+  // For a general discussion of the tradeoffs and benchmark results, see:
+  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+  if (Subtarget.is64Bit()) {
+    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i8);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::RSP, MVT::i64),     // Base
+      DAG.getTargetConstant(1, DL, MVT::i8),   // Scale
+      DAG.getRegister(0, MVT::i64),            // Index
+      DAG.getTargetConstant(0, DL, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),            // Segment.
+      Zero,
+      Chain};
+    SDNode *Res = DAG.getMachineNode(X86::LOCK_OR32mi8, DL, MVT::Other, Ops);
+    return SDValue(Res, 0);
+  }
+
+  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+  SDValue Ops[] = {
+    DAG.getRegister(X86::ESP, MVT::i32),     // Base
+    DAG.getTargetConstant(1, DL, MVT::i8),   // Scale
+    DAG.getRegister(0, MVT::i32),            // Index
+    DAG.getTargetConstant(0, DL, MVT::i32),  // Disp
+    DAG.getRegister(0, MVT::i32),            // Segment.
+    Zero,
+    Chain
+  };
+  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   unsigned NewOpc = 0;
@@ -26257,6 +26318,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
 /// Lower atomic_load_ops into LOCK-prefixed operations.
 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
+  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
   SDValue Chain = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
@@ -26271,7 +26333,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
   // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
   // select LXADD if LOCK_SUB can't be selected.
   if (Opc == ISD::ATOMIC_LOAD_SUB) {
-    AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
     RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
     return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                          RHS, AN->getMemOperand());
@@ -26281,6 +26342,32 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
     return N;
   }
 
+  // Specialized lowering for the canonical form of an idempotent atomicrmw.
+  // The core idea here is that since the memory location isn't actually
+  // changing, all we need is a lowering for the *ordering* impacts of the
+  // atomicrmw.  As such, we can choose a different operation and memory
+  // location to minimize impact on other code.
+  if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+    // On X86, the only ordering which actually requires an instruction is
+    // seq_cst which isn't SingleThread, everything just needs to be preserved
+    // during codegen and then dropped. Note that we expect (but don't assume),
+    // that orderings other than seq_cst and acq_rel have been canonicalized to
+    // a store or load.
+    if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+        AN->getSyncScopeID() == SyncScope::System) {
+      // Prefer a locked operation against a stack location to minimize cache
+      // traffic.  This assumes that stack locations are very likely to be
+      // accessed only by the owning thread.
+      SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+      DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+      return SDValue();
+    }
+    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+    SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+    return SDValue();
+  }
+
   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
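
To make the reasoning behind emitLockedStackOp concrete: the only hardware reordering that matters for these idempotent RMWs on x86 is StoreLoad, and per the SDM section cited in the comments a LOCK-prefixed instruction orders against surrounding loads and stores regardless of the address it touches. A hedged C++ sketch (illustrative only, not taken from this commit or its tests) of the kind of publication pattern whose ordering the locked stack operation has to preserve:

  #include <atomic>

  std::atomic<int> X{0}, Y{0};

  // Dekker-style publication: without a full barrier, the store to X can be
  // held in the store buffer past the later load of Y (StoreLoad reordering).
  // The idempotent RMW below, used as a fence, previously lowered to mfence;
  // with this commit it lowers to "lock orl $0, (%rsp)", an equally strong
  // barrier that touches a cache line the executing thread almost certainly
  // owns, so it adds no cross-core contention on X or Y.
  int publishThenCheck() {
    X.store(1, std::memory_order_relaxed);
    Y.fetch_or(0, std::memory_order_seq_cst);  // value unchanged, result unused
    return Y.load(std::memory_order_relaxed);
  }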

llvm/test/CodeGen/X86/atomic-idempotent.ll

Lines changed: 23 additions & 65 deletions
@@ -166,86 +166,51 @@ define i32 @and32 (i32* %p) {
 }
 
 define void @or32_nouse_monotonic(i32* %p) {
-; X64-LABEL: or32_nouse_monotonic:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_monotonic:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_monotonic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 monotonic
   ret void
 }
 
 
 define void @or32_nouse_acquire(i32* %p) {
-; X64-LABEL: or32_nouse_acquire:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_acquire:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_acquire:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 acquire
   ret void
 }
 
 define void @or32_nouse_release(i32* %p) {
-; X64-LABEL: or32_nouse_release:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_release:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_release:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 release
   ret void
 }
 
 define void @or32_nouse_acq_rel(i32* %p) {
-; X64-LABEL: or32_nouse_acq_rel:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_acq_rel:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_acq_rel:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 acq_rel
   ret void
 }
 
 define void @or32_nouse_seq_cst(i32* %p) {
 ; X64-LABEL: or32_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or32_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i32* %p, i32 0 seq_cst
   ret void
@@ -255,8 +220,7 @@ define void @or32_nouse_seq_cst(i32* %p) {
 define void @or64_nouse_seq_cst(i64* %p) {
 ; X64-LABEL: or64_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or64_nouse_seq_cst:
@@ -334,15 +298,12 @@ define void @or128_nouse_seq_cst(i128* %p) {
 define void @or16_nouse_seq_cst(i16* %p) {
 ; X64-LABEL: or16_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or16_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movzwl (%eax), %eax
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i16* %p, i16 0 seq_cst
   ret void
@@ -351,15 +312,12 @@ define void @or16_nouse_seq_cst(i16* %p) {
 define void @or8_nouse_seq_cst(i8* %p) {
 ; X64-LABEL: or8_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movb (%rdi), %al
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or8_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movb (%eax), %al
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i8* %p, i8 0 seq_cst
   ret void
