@@ -25727,6 +25727,14 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
     return nullptr;
 
+  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+  // lowering available in lowerAtomicArith.
+  // TODO: push more cases through this path.
+  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+        AI->use_empty())
+      return nullptr;
+
   auto Builder = IRBuilder<>(AI);
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   auto SSID = AI->getSyncScopeID();
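For context (not part of the patch): the "canonical idempotent atomicrmw" the early exit above matches is an `or` of zero whose result is unused, i.e. a read-modify-write used purely for its ordering effect. A minimal C++ sketch of source code that typically produces exactly that IR (`atomicrmw or ..., 0 seq_cst` with no uses):

```cpp
#include <atomic>

std::atomic<int> Guard{0};

// The stored value is unchanged and the result is discarded, so the only
// observable effect is the ordering. Clang normally emits
// `atomicrmw or ..., i32 0 seq_cst` for fetch_or, which matches the
// Or / isZero() / use_empty() check in the hunk above.
void seqCstFenceIdiom() {
  Guard.fetch_or(0, std::memory_order_seq_cst);
}
```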
@@ -26223,6 +26231,59 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }
 
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix.  Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable.  Returns the new Chain result.
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget,
+                                 SDValue Chain, SDLoc DL) {
+  // Implementation notes:
+  // 1) LOCK prefix creates a full read/write reordering barrier for memory
+  // operations issued by the current processor.  As such, the location
+  // referenced is not relevant for the ordering properties of the instruction.
+  // See: Intel® 64 and IA-32 Architectures Software Developer's Manual,
+  // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+  // 2) Using an immediate operand appears to be the best encoding choice
+  // here since it doesn't require an extra register.
+  // 3) OR appears to be very slightly faster than ADD. (Though, the difference
+  // is small enough it might just be measurement noise.)
+  // 4) For the moment, we are using top of stack.  This creates false sharing
+  // with actual stack access/call sequences, and it would be better to use a
+  // location within the redzone.  For the moment, this is still better than an
+  // mfence though.  TODO: Revise the offset used when we can assume a redzone.
+  //
+  // For a general discussion of the tradeoffs and benchmark results, see:
+  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+  if (Subtarget.is64Bit()) {
+    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i8);
+    SDValue Ops[] = {
+        DAG.getRegister(X86::RSP, MVT::i64),    // Base
+        DAG.getTargetConstant(1, DL, MVT::i8),  // Scale
+        DAG.getRegister(0, MVT::i64),           // Index
+        DAG.getTargetConstant(0, DL, MVT::i32), // Disp
+        DAG.getRegister(0, MVT::i32),           // Segment.
+        Zero,
+        Chain};
+    SDNode *Res = DAG.getMachineNode(X86::LOCK_OR32mi8, DL, MVT::Other, Ops);
+    return SDValue(Res, 0);
+  }
+
+  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+  SDValue Ops[] = {
+      DAG.getRegister(X86::ESP, MVT::i32),    // Base
+      DAG.getTargetConstant(1, DL, MVT::i8),  // Scale
+      DAG.getRegister(0, MVT::i32),           // Index
+      DAG.getTargetConstant(0, DL, MVT::i32), // Disp
+      DAG.getRegister(0, MVT::i32),           // Segment.
+      Zero,
+      Chain};
+  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   unsigned NewOpc = 0;
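The implementation notes above boil down to: any LOCK-prefixed instruction is a full barrier on x86, so a locked `or $0` against the thread's own stack slot is a cheaper stand-in for `mfence`. A hand-written illustration of the two fences being traded off, not part of the patch, using GNU inline asm and assuming x86-64:

```cpp
// Full StoreLoad barrier via MFENCE: correct, but comparatively slow.
static inline void fenceViaMFence() {
  asm volatile("mfence" ::: "memory");
}

// Full barrier via a locked no-op RMW on the top of the current stack,
// mirroring what emitLockedStackOp selects (LOCK_OR32mi8 with RSP as base,
// displacement 0, immediate 0).
static inline void fenceViaLockedOr() {
  asm volatile("lock orl $0, (%%rsp)" ::: "memory", "cc");
}
```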
@@ -26257,6 +26318,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
 /// Lower atomic_load_ops into LOCK-prefixed operations.
 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
+  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
   SDValue Chain = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
@@ -26271,7 +26333,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
   // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
   // select LXADD if LOCK_SUB can't be selected.
   if (Opc == ISD::ATOMIC_LOAD_SUB) {
-    AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
     RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
     return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                          RHS, AN->getMemOperand());
@@ -26281,6 +26342,32 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
     return N;
   }
 
+  // Specialized lowering for the canonical form of an idempotent atomicrmw.
+  // The core idea here is that since the memory location isn't actually
+  // changing, all we need is a lowering for the *ordering* impacts of the
+  // atomicrmw.  As such, we can choose a different operation and memory
+  // location to minimize impact on other code.
+  if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+    // On X86, the only ordering which actually requires an instruction is
+    // seq_cst (which isn't SingleThread); everything else just needs to be
+    // preserved during codegen and then dropped.  Note that we expect (but
+    // don't assume) that orderings other than seq_cst and acq_rel have been
+    // canonicalized to a store or load.
+    if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+        AN->getSyncScopeID() == SyncScope::System) {
+      // Prefer a locked operation against a stack location to minimize cache
+      // traffic.  This assumes that stack locations are very likely to be
+      // accessed only by the owning thread.
+      SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+      DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+      return SDValue();
+    }
+    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+    SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+    return SDValue();
+  }
+
   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
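To make the two paths in that hunk concrete, here is a small hypothetical example of how discarded idempotent RMWs with different orderings are expected to lower after this change, assuming the usual Clang mapping of `fetch_or` onto `atomicrmw or`:

```cpp
#include <atomic>

std::atomic<int> X{0};

// seq_cst at system scope: takes the emitLockedStackOp path, i.e. a locked OR
// against the stack rather than an MFENCE or a LOCK-prefixed RMW on &X.
void seqCstNoOpRMW() { X.fetch_or(0, std::memory_order_seq_cst); }

// Weaker ordering: only compiler-level ordering must be preserved, so the node
// is rewritten to X86ISD::MEMBARRIER, which emits no instruction.
void acqRelNoOpRMW() { X.fetch_or(0, std::memory_order_acq_rel); }
```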