
Commit 522ff6e

[X86] Prefer lock or over mfence
Originally opened as https://reviews.llvm.org/D129947.

LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in the same direction but did not touch `fence seq_cst`.

Amusingly, this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/

After another two years it does not look like anyone has complained about the GCC switch, and `__builtin_ia32_mfence` is still available for anyone who wants this precise instruction.
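For concreteness, a minimal sketch (not part of the commit) of the codegen difference being described. The function below is roughly what Clang emits for `__atomic_thread_fence(__ATOMIC_SEQ_CST)`; the exact stack slot chosen for the locked no-op varies by target.

; sketch.ll -- hypothetical example, e.g. `llc -mtriple=x86_64-- sketch.ll -o -`
define void @thread_fence_seq_cst() {
  fence seq_cst          ; what Clang emits for __atomic_thread_fence(__ATOMIC_SEQ_CST)
  ret void
}
; Before this change (or without the new tuning flag):  mfence
; After this change, on subtargets with avoid-mfence:   lock orl $0, <stack slot>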
1 parent 729416e commit 522ff6e

5 files changed (+847 -107 lines changed)

5 files changed

+847
-107
lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 36 additions & 16 deletions
@@ -772,6 +772,10 @@ def TuningUseGLMDivSqrtCosts
 def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
                                        "Target has branch hint feature">;
 
+def TuningAvoidMFENCE
+    : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
+                       "Avoid MFENCE for fence seq_cst, and instead use lock or">;
+
 //===----------------------------------------------------------------------===//
 // X86 CPU Families
 // TODO: Remove these - use general tuning features to determine codegen.
@@ -833,7 +837,8 @@ def ProcessorFeatures {
     TuningSlow3OpsLEA,
     TuningSlowDivide64,
     TuningSlowIncDec,
-    TuningInsertVZEROUPPER
+    TuningInsertVZEROUPPER,
+    TuningAvoidMFENCE
   ];
 
   list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -849,7 +854,8 @@ def ProcessorFeatures {
     TuningFastSHLDRotate,
     TuningFast15ByteNOP,
     TuningPOPCNTFalseDeps,
-    TuningInsertVZEROUPPER
+    TuningInsertVZEROUPPER,
+    TuningAvoidMFENCE
   ];
 
   list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -868,7 +874,8 @@ def ProcessorFeatures {
     TuningPOPCNTFalseDeps,
     TuningLZCNTFalseDeps,
     TuningInsertVZEROUPPER,
-    TuningAllowLight256Bit
+    TuningAllowLight256Bit,
+    TuningAvoidMFENCE
   ];
 
   list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -892,15 +899,17 @@ def ProcessorFeatures {
     TuningFastGather,
     TuningPOPCNTFalseDeps,
     TuningInsertVZEROUPPER,
-    TuningAllowLight256Bit
+    TuningAllowLight256Bit,
+    TuningAvoidMFENCE
   ];
 
   // Nehalem
   list<SubtargetFeature> NHMFeatures = X86_64V2Features;
   list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
                                       TuningSlowDivide64,
                                       TuningInsertVZEROUPPER,
-                                      TuningNoDomainDelayMov];
+                                      TuningNoDomainDelayMov,
+                                      TuningAvoidMFENCE];
 
   // Westmere
   list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -921,7 +930,8 @@ def ProcessorFeatures {
                                       TuningFast15ByteNOP,
                                       TuningPOPCNTFalseDeps,
                                       TuningInsertVZEROUPPER,
-                                      TuningNoDomainDelayMov];
+                                      TuningNoDomainDelayMov,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> SNBFeatures =
     !listconcat(WSMFeatures, SNBAdditionalFeatures);
 
@@ -987,7 +997,8 @@ def ProcessorFeatures {
                                       TuningAllowLight256Bit,
                                       TuningNoDomainDelayMov,
                                       TuningNoDomainDelayShuffle,
-                                      TuningNoDomainDelayBlend];
+                                      TuningNoDomainDelayBlend,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> SKLFeatures =
     !listconcat(BDWFeatures, SKLAdditionalFeatures);
 
@@ -1022,7 +1033,8 @@ def ProcessorFeatures {
                                       TuningNoDomainDelayMov,
                                       TuningNoDomainDelayShuffle,
                                       TuningNoDomainDelayBlend,
-                                      TuningFastImmVectorShift];
+                                      TuningFastImmVectorShift,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> SKXFeatures =
     !listconcat(BDWFeatures, SKXAdditionalFeatures);
 
@@ -1065,7 +1077,8 @@ def ProcessorFeatures {
                                       TuningNoDomainDelayMov,
                                       TuningNoDomainDelayShuffle,
                                       TuningNoDomainDelayBlend,
-                                      TuningFastImmVectorShift];
+                                      TuningFastImmVectorShift,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> CNLFeatures =
     !listconcat(SKLFeatures, CNLAdditionalFeatures);
 
@@ -1094,7 +1107,8 @@ def ProcessorFeatures {
                                       TuningNoDomainDelayMov,
                                       TuningNoDomainDelayShuffle,
                                       TuningNoDomainDelayBlend,
-                                      TuningFastImmVectorShift];
+                                      TuningFastImmVectorShift,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> ICLFeatures =
     !listconcat(CNLFeatures, ICLAdditionalFeatures);
 
@@ -1268,7 +1282,8 @@ def ProcessorFeatures {
   // Tremont
   list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
                                                   FeatureGFNI];
-  list<SubtargetFeature> TRMTuning = GLPTuning;
+  list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
+  list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
   list<SubtargetFeature> TRMFeatures =
     !listconcat(GLPFeatures, TRMAdditionalFeatures);
 
@@ -1446,7 +1461,8 @@ def ProcessorFeatures {
                                       TuningFastImm16,
                                       TuningSBBDepBreaking,
                                       TuningSlowDivide64,
-                                      TuningSlowSHLD];
+                                      TuningSlowSHLD,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
 
@@ -1475,7 +1491,8 @@ def ProcessorFeatures {
                                       TuningFastScalarShiftMasks,
                                       TuningBranchFusion,
                                       TuningSBBDepBreaking,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   // PileDriver
   list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1555,7 +1572,8 @@ def ProcessorFeatures {
                                       TuningSlowSHLD,
                                       TuningSBBDepBreaking,
                                       TuningInsertVZEROUPPER,
-                                      TuningAllowLight256Bit];
+                                      TuningAllowLight256Bit,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
                                                   FeatureRDPID,
                                                   FeatureRDPRU,
@@ -1740,7 +1758,8 @@ def : ProcModel<P, SandyBridgeModel, [
   [
     TuningMacroFusion,
     TuningSlowUAMem16,
-    TuningInsertVZEROUPPER
+    TuningInsertVZEROUPPER,
+    TuningAvoidMFENCE
   ]>;
 }
 foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1759,7 +1778,8 @@ def : ProcModel<P, SandyBridgeModel, [
   [
     TuningMacroFusion,
     TuningSlowUAMem16,
-    TuningInsertVZEROUPPER
+    TuningInsertVZEROUPPER,
+    TuningAvoidMFENCE
   ]>;
 }
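Since TuningAvoidMFENCE is an ordinary SubtargetFeature, it should also be controllable independently of -mcpu. A hypothetical FileCheck-style experiment (not a test added by this commit), using the "avoid-mfence" attribute string defined above:

; RUN: llc -mtriple=x86_64-- -mattr=+avoid-mfence %s -o - | FileCheck %s --check-prefix=LOCKOR
; RUN: llc -mtriple=x86_64-- -mattr=-avoid-mfence %s -o - | FileCheck %s --check-prefix=MFENCE
define void @f() {
  fence seq_cst
  ret void
}
; LOCKOR: lock or
; MFENCE: mfence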

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 4 additions & 15 deletions
@@ -31808,21 +31808,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   // otherwise, we might be able to be more aggressive on relaxed idempotent
   // rmw. In practice, they do not look useful, so we don't try to be
   // especially clever.
-  if (SSID == SyncScope::SingleThread)
-    // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
-    // the IR level, so we must wrap it in an intrinsic.
-    return nullptr;
-
-  if (!Subtarget.hasMFence())
-    // FIXME: it might make sense to use a locked operation here but on a
-    // different cache-line to prevent cache-line bouncing. In practice it
-    // is probably a small win, and x86 processors without mfence are rare
-    // enough that we do not bother.
-    return nullptr;
 
-  Function *MFence =
-      llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence);
-  Builder.CreateCall(MFence, {});
+  // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
+  // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
+  Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
 
   // Finally we can emit the atomic load.
   LoadInst *Loaded = Builder.CreateAlignedLoad(
@@ -31910,7 +31899,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
   // cross-thread fence.
   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
       FenceSSID == SyncScope::System) {
-    if (Subtarget.hasMFence())
+    if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 
     SDValue Chain = Op.getOperand(0);
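For orientation, the lowerIdempotentRMWIntoFencedLoad change above swaps a direct call to the `@llvm.x86.sse2.mfence` intrinsic for a plain IR fence, so idempotent read-modify-writes now funnel through the same LowerATOMIC_FENCE path (and thus the same avoid-mfence decision). A paraphrased sketch of the rewrite, not copied from the commit:

;   %old = atomicrmw or ptr %p, i16 0 acquire        ; idempotent RMW of an unchanged value
; is turned into roughly
;   fence seq_cst                                    ; previously: call void @llvm.x86.sse2.mfence()
;   %old = load atomic i16, ptr %p acquire, align 2
; and on subtargets with avoid-mfence the fence then lowers to `lock orl $0, <stack slot>`
; followed by an ordinary load, which is what the updated test checks below expect.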

llvm/test/CodeGen/X86/atomic-idempotent.ll

Lines changed: 30 additions & 56 deletions
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
 ;
 ; X86-SLM-LABEL: add8:
 ; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT:    xorl %eax, %eax
-; X86-SLM-NEXT:    lock xaddb %al, (%ecx)
-; X86-SLM-NEXT:    # kill: def $al killed $al killed $eax
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT:    lock orl $0, (%esp)
+; X86-SLM-NEXT:    movzbl (%eax), %eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: add8:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    xorl %eax, %eax
-; X86-ATOM-NEXT:    lock xaddb %al, (%ecx)
-; X86-ATOM-NEXT:    # kill: def $al killed $al killed $eax
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    movzbl (%eax), %eax
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
@@ -62,26 +60,18 @@ define i16 @or16(ptr %p) {
 ;
 ; X86-SLM-LABEL: or16:
 ; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT:    movzwl (%ecx), %eax
-; X86-SLM-NEXT:    .p2align 4
-; X86-SLM-NEXT:  .LBB1_1: # %atomicrmw.start
-; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT:    lock cmpxchgw %ax, (%ecx)
-; X86-SLM-NEXT:    jne .LBB1_1
-; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT:    lock orl $0, (%esp)
+; X86-SLM-NEXT:    movzwl (%eax), %eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or16:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    movzwl (%ecx), %eax
-; X86-ATOM-NEXT:    .p2align 4
-; X86-ATOM-NEXT:  .LBB1_1: # %atomicrmw.start
-; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT:    lock cmpxchgw %ax, (%ecx)
-; X86-ATOM-NEXT:    jne .LBB1_1
-; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    movzwl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   %1 = atomicrmw or ptr %p, i16 0 acquire
   ret i16 %1
@@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) {
 ;
 ; X86-SLM-LABEL: xor32:
 ; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT:    movl (%ecx), %eax
-; X86-SLM-NEXT:    .p2align 4
-; X86-SLM-NEXT:  .LBB2_1: # %atomicrmw.start
-; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT:    jne .LBB2_1
-; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT:    lock orl $0, (%esp)
+; X86-SLM-NEXT:    movl (%eax), %eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: xor32:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    movl (%ecx), %eax
-; X86-ATOM-NEXT:    .p2align 4
-; X86-ATOM-NEXT:  .LBB2_1: # %atomicrmw.start
-; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT:    jne .LBB2_1
-; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    movl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   %1 = atomicrmw xor ptr %p, i32 0 release
   ret i32 %1
@@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) {
 ;
 ; X86-SLM-LABEL: and32:
 ; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT:    movl (%ecx), %eax
-; X86-SLM-NEXT:    .p2align 4
-; X86-SLM-NEXT:  .LBB5_1: # %atomicrmw.start
-; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT:    jne .LBB5_1
-; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT:    lock orl $0, (%esp)
+; X86-SLM-NEXT:    movl (%eax), %eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: and32:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT:    movl (%ecx), %eax
-; X86-ATOM-NEXT:    .p2align 4
-; X86-ATOM-NEXT:  .LBB5_1: # %atomicrmw.start
-; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT:    lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT:    jne .LBB5_1
-; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    lock orl $0, (%esp)
+; X86-ATOM-NEXT:    movl (%eax), %eax
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   %1 = atomicrmw and ptr %p, i32 -1 acq_rel
   ret i32 %1
