Skip to content

Commit 4d502dd

Browse files
committed
[X86] Prefer `lock or` over `mfence`
Originally opened as https://reviews.llvm.org/D129947. LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs, `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch `fence seq_cst`. Amusingly, this came up elsewhere (https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/). After another 2 years, it doesn't look like anyone has complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction.
1 parent bfde178 commit 4d502dd

File tree

5 files changed

+845
-106
lines changed

5 files changed

+845
-106
lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts
754754
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
755755
"Target has branch hint feature">;
756756

757+
def TuningAvoidMFENCE
758+
: SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
759+
"Avoid MFENCE for fence seq_cst, and instead use lock or">;
760+
757761
//===----------------------------------------------------------------------===//
758762
// X86 CPU Families
759763
// TODO: Remove these - use general tuning features to determine codegen.
@@ -815,7 +819,8 @@ def ProcessorFeatures {
815819
TuningSlow3OpsLEA,
816820
TuningSlowDivide64,
817821
TuningSlowIncDec,
818-
TuningInsertVZEROUPPER
822+
TuningInsertVZEROUPPER,
823+
TuningAvoidMFENCE
819824
];
820825

821826
list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -831,7 +836,8 @@ def ProcessorFeatures {
831836
TuningFastSHLDRotate,
832837
TuningFast15ByteNOP,
833838
TuningPOPCNTFalseDeps,
834-
TuningInsertVZEROUPPER
839+
TuningInsertVZEROUPPER,
840+
TuningAvoidMFENCE
835841
];
836842

837843
list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -850,7 +856,8 @@ def ProcessorFeatures {
850856
TuningPOPCNTFalseDeps,
851857
TuningLZCNTFalseDeps,
852858
TuningInsertVZEROUPPER,
853-
TuningAllowLight256Bit
859+
TuningAllowLight256Bit,
860+
TuningAvoidMFENCE
854861
];
855862

856863
list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -874,15 +881,17 @@ def ProcessorFeatures {
874881
TuningFastGather,
875882
TuningPOPCNTFalseDeps,
876883
TuningInsertVZEROUPPER,
877-
TuningAllowLight256Bit
884+
TuningAllowLight256Bit,
885+
TuningAvoidMFENCE
878886
];
879887

880888
// Nehalem
881889
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
882890
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
883891
TuningSlowDivide64,
884892
TuningInsertVZEROUPPER,
885-
TuningNoDomainDelayMov];
893+
TuningNoDomainDelayMov,
894+
TuningAvoidMFENCE];
886895

887896
// Westmere
888897
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -903,7 +912,8 @@ def ProcessorFeatures {
903912
TuningFast15ByteNOP,
904913
TuningPOPCNTFalseDeps,
905914
TuningInsertVZEROUPPER,
906-
TuningNoDomainDelayMov];
915+
TuningNoDomainDelayMov,
916+
TuningAvoidMFENCE];
907917
list<SubtargetFeature> SNBFeatures =
908918
!listconcat(WSMFeatures, SNBAdditionalFeatures);
909919

@@ -969,7 +979,8 @@ def ProcessorFeatures {
969979
TuningAllowLight256Bit,
970980
TuningNoDomainDelayMov,
971981
TuningNoDomainDelayShuffle,
972-
TuningNoDomainDelayBlend];
982+
TuningNoDomainDelayBlend,
983+
TuningAvoidMFENCE];
973984
list<SubtargetFeature> SKLFeatures =
974985
!listconcat(BDWFeatures, SKLAdditionalFeatures);
975986

@@ -1004,7 +1015,8 @@ def ProcessorFeatures {
10041015
TuningNoDomainDelayMov,
10051016
TuningNoDomainDelayShuffle,
10061017
TuningNoDomainDelayBlend,
1007-
TuningFastImmVectorShift];
1018+
TuningFastImmVectorShift,
1019+
TuningAvoidMFENCE];
10081020
list<SubtargetFeature> SKXFeatures =
10091021
!listconcat(BDWFeatures, SKXAdditionalFeatures);
10101022

@@ -1047,7 +1059,8 @@ def ProcessorFeatures {
10471059
TuningNoDomainDelayMov,
10481060
TuningNoDomainDelayShuffle,
10491061
TuningNoDomainDelayBlend,
1050-
TuningFastImmVectorShift];
1062+
TuningFastImmVectorShift,
1063+
TuningAvoidMFENCE];
10511064
list<SubtargetFeature> CNLFeatures =
10521065
!listconcat(SKLFeatures, CNLAdditionalFeatures);
10531066

@@ -1076,7 +1089,8 @@ def ProcessorFeatures {
10761089
TuningNoDomainDelayMov,
10771090
TuningNoDomainDelayShuffle,
10781091
TuningNoDomainDelayBlend,
1079-
TuningFastImmVectorShift];
1092+
TuningFastImmVectorShift,
1093+
TuningAvoidMFENCE];
10801094
list<SubtargetFeature> ICLFeatures =
10811095
!listconcat(CNLFeatures, ICLAdditionalFeatures);
10821096

@@ -1222,7 +1236,8 @@ def ProcessorFeatures {
12221236
// Tremont
12231237
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
12241238
FeatureGFNI];
1225-
list<SubtargetFeature> TRMTuning = GLPTuning;
1239+
list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
1240+
list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
12261241
list<SubtargetFeature> TRMFeatures =
12271242
!listconcat(GLPFeatures, TRMAdditionalFeatures);
12281243

@@ -1429,7 +1444,8 @@ def ProcessorFeatures {
14291444
TuningFastScalarShiftMasks,
14301445
TuningBranchFusion,
14311446
TuningSBBDepBreaking,
1432-
TuningInsertVZEROUPPER];
1447+
TuningInsertVZEROUPPER,
1448+
TuningAvoidMFENCE];
14331449

14341450
// PileDriver
14351451
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1509,7 +1525,8 @@ def ProcessorFeatures {
15091525
TuningSlowSHLD,
15101526
TuningSBBDepBreaking,
15111527
TuningInsertVZEROUPPER,
1512-
TuningAllowLight256Bit];
1528+
TuningAllowLight256Bit,
1529+
TuningAvoidMFENCE];
15131530
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
15141531
FeatureRDPID,
15151532
FeatureRDPRU,
@@ -1697,7 +1714,8 @@ def : ProcModel<P, SandyBridgeModel, [
16971714
[
16981715
TuningMacroFusion,
16991716
TuningSlowUAMem16,
1700-
TuningInsertVZEROUPPER
1717+
TuningInsertVZEROUPPER,
1718+
TuningAvoidMFENCE
17011719
]>;
17021720
}
17031721
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1716,7 +1734,8 @@ def : ProcModel<P, SandyBridgeModel, [
17161734
[
17171735
TuningMacroFusion,
17181736
TuningSlowUAMem16,
1719-
TuningInsertVZEROUPPER
1737+
TuningInsertVZEROUPPER,
1738+
TuningAvoidMFENCE
17201739
]>;
17211740
}
17221741

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31422,21 +31422,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
3142231422
// otherwise, we might be able to be more aggressive on relaxed idempotent
3142331423
// rmw. In practice, they do not look useful, so we don't try to be
3142431424
// especially clever.
31425-
if (SSID == SyncScope::SingleThread)
31426-
// FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
31427-
// the IR level, so we must wrap it in an intrinsic.
31428-
return nullptr;
31429-
31430-
if (!Subtarget.hasMFence())
31431-
// FIXME: it might make sense to use a locked operation here but on a
31432-
// different cache-line to prevent cache-line bouncing. In practice it
31433-
// is probably a small win, and x86 processors without mfence are rare
31434-
// enough that we do not bother.
31435-
return nullptr;
3143631425

31437-
Function *MFence =
31438-
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
31439-
Builder.CreateCall(MFence, {});
31426+
// Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
31427+
// lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
31428+
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
3144031429

3144131430
// Finally we can emit the atomic load.
3144231431
LoadInst *Loaded = Builder.CreateAlignedLoad(
@@ -31524,7 +31513,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
3152431513
// cross-thread fence.
3152531514
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
3152631515
FenceSSID == SyncScope::System) {
31527-
if (Subtarget.hasMFence())
31516+
if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
3152831517
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
3152931518

3153031519
SDValue Chain = Op.getOperand(0);

llvm/test/CodeGen/X86/atomic-idempotent.ll

Lines changed: 30 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
2727
;
2828
; X86-SLM-LABEL: add8:
2929
; X86-SLM: # %bb.0:
30-
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
31-
; X86-SLM-NEXT: xorl %eax, %eax
32-
; X86-SLM-NEXT: lock xaddb %al, (%ecx)
33-
; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
30+
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
31+
; X86-SLM-NEXT: lock orl $0, (%esp)
32+
; X86-SLM-NEXT: movzbl (%eax), %eax
3433
; X86-SLM-NEXT: retl
3534
;
3635
; X86-ATOM-LABEL: add8:
3736
; X86-ATOM: # %bb.0:
38-
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
39-
; X86-ATOM-NEXT: xorl %eax, %eax
40-
; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
41-
; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
37+
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
38+
; X86-ATOM-NEXT: lock orl $0, (%esp)
39+
; X86-ATOM-NEXT: movzbl (%eax), %eax
4240
; X86-ATOM-NEXT: nop
4341
; X86-ATOM-NEXT: nop
4442
; X86-ATOM-NEXT: retl
@@ -62,26 +60,18 @@ define i16 @or16(ptr %p) {
6260
;
6361
; X86-SLM-LABEL: or16:
6462
; X86-SLM: # %bb.0:
65-
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
66-
; X86-SLM-NEXT: movzwl (%ecx), %eax
67-
; X86-SLM-NEXT: .p2align 4, 0x90
68-
; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
69-
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
70-
; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
71-
; X86-SLM-NEXT: jne .LBB1_1
72-
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
63+
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
64+
; X86-SLM-NEXT: lock orl $0, (%esp)
65+
; X86-SLM-NEXT: movzwl (%eax), %eax
7366
; X86-SLM-NEXT: retl
7467
;
7568
; X86-ATOM-LABEL: or16:
7669
; X86-ATOM: # %bb.0:
77-
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
78-
; X86-ATOM-NEXT: movzwl (%ecx), %eax
79-
; X86-ATOM-NEXT: .p2align 4, 0x90
80-
; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
81-
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
82-
; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
83-
; X86-ATOM-NEXT: jne .LBB1_1
84-
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
70+
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
71+
; X86-ATOM-NEXT: lock orl $0, (%esp)
72+
; X86-ATOM-NEXT: movzwl (%eax), %eax
73+
; X86-ATOM-NEXT: nop
74+
; X86-ATOM-NEXT: nop
8575
; X86-ATOM-NEXT: retl
8676
%1 = atomicrmw or ptr %p, i16 0 acquire
8777
ret i16 %1
@@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) {
10393
;
10494
; X86-SLM-LABEL: xor32:
10595
; X86-SLM: # %bb.0:
106-
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
107-
; X86-SLM-NEXT: movl (%ecx), %eax
108-
; X86-SLM-NEXT: .p2align 4, 0x90
109-
; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
110-
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
111-
; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
112-
; X86-SLM-NEXT: jne .LBB2_1
113-
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
96+
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
97+
; X86-SLM-NEXT: lock orl $0, (%esp)
98+
; X86-SLM-NEXT: movl (%eax), %eax
11499
; X86-SLM-NEXT: retl
115100
;
116101
; X86-ATOM-LABEL: xor32:
117102
; X86-ATOM: # %bb.0:
118-
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
119-
; X86-ATOM-NEXT: movl (%ecx), %eax
120-
; X86-ATOM-NEXT: .p2align 4, 0x90
121-
; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
122-
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
123-
; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
124-
; X86-ATOM-NEXT: jne .LBB2_1
125-
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
103+
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
104+
; X86-ATOM-NEXT: lock orl $0, (%esp)
105+
; X86-ATOM-NEXT: movl (%eax), %eax
106+
; X86-ATOM-NEXT: nop
107+
; X86-ATOM-NEXT: nop
126108
; X86-ATOM-NEXT: retl
127109
%1 = atomicrmw xor ptr %p, i32 0 release
128110
ret i32 %1
@@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) {
318300
;
319301
; X86-SLM-LABEL: and32:
320302
; X86-SLM: # %bb.0:
321-
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
322-
; X86-SLM-NEXT: movl (%ecx), %eax
323-
; X86-SLM-NEXT: .p2align 4, 0x90
324-
; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
325-
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
326-
; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
327-
; X86-SLM-NEXT: jne .LBB5_1
328-
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
303+
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
304+
; X86-SLM-NEXT: lock orl $0, (%esp)
305+
; X86-SLM-NEXT: movl (%eax), %eax
329306
; X86-SLM-NEXT: retl
330307
;
331308
; X86-ATOM-LABEL: and32:
332309
; X86-ATOM: # %bb.0:
333-
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
334-
; X86-ATOM-NEXT: movl (%ecx), %eax
335-
; X86-ATOM-NEXT: .p2align 4, 0x90
336-
; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
337-
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
338-
; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
339-
; X86-ATOM-NEXT: jne .LBB5_1
340-
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
310+
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
311+
; X86-ATOM-NEXT: lock orl $0, (%esp)
312+
; X86-ATOM-NEXT: movl (%eax), %eax
313+
; X86-ATOM-NEXT: nop
314+
; X86-ATOM-NEXT: nop
341315
; X86-ATOM-NEXT: retl
342316
%1 = atomicrmw and ptr %p, i32 -1 acq_rel
343317
ret i32 %1

0 commit comments

Comments
 (0)