Skip to content

Commit beec8ab

Browse files
committed
[X86] Prefer lock or over mfence
Originally opened as https://reviews.llvm.org/D129947. LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs, `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch `fence seq_cst`. Amusingly, this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/ After another 2 years, it doesn't look like anyone has complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction.
1 parent fa93be4 commit beec8ab

File tree

3 files changed

+34
-18
lines changed

3 files changed

+34
-18
lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts
754754
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
755755
"Target has branch hint feature">;
756756

757+
def TuningAvoidMFENCE
758+
: SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
759+
"Avoid MFENCE for fence seq_cst, and instead use lock or">;
760+
757761
//===----------------------------------------------------------------------===//
758762
// X86 CPU Families
759763
// TODO: Remove these - use general tuning features to determine codegen.
@@ -882,7 +886,8 @@ def ProcessorFeatures {
882886
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
883887
TuningSlowDivide64,
884888
TuningInsertVZEROUPPER,
885-
TuningNoDomainDelayMov];
889+
TuningNoDomainDelayMov,
890+
TuningAvoidMFENCE];
886891

887892
// Westmere
888893
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -903,7 +908,8 @@ def ProcessorFeatures {
903908
TuningFast15ByteNOP,
904909
TuningPOPCNTFalseDeps,
905910
TuningInsertVZEROUPPER,
906-
TuningNoDomainDelayMov];
911+
TuningNoDomainDelayMov,
912+
TuningAvoidMFENCE];
907913
list<SubtargetFeature> SNBFeatures =
908914
!listconcat(WSMFeatures, SNBAdditionalFeatures);
909915

@@ -969,7 +975,8 @@ def ProcessorFeatures {
969975
TuningAllowLight256Bit,
970976
TuningNoDomainDelayMov,
971977
TuningNoDomainDelayShuffle,
972-
TuningNoDomainDelayBlend];
978+
TuningNoDomainDelayBlend,
979+
TuningAvoidMFENCE];
973980
list<SubtargetFeature> SKLFeatures =
974981
!listconcat(BDWFeatures, SKLAdditionalFeatures);
975982

@@ -1004,7 +1011,8 @@ def ProcessorFeatures {
10041011
TuningNoDomainDelayMov,
10051012
TuningNoDomainDelayShuffle,
10061013
TuningNoDomainDelayBlend,
1007-
TuningFastImmVectorShift];
1014+
TuningFastImmVectorShift,
1015+
TuningAvoidMFENCE];
10081016
list<SubtargetFeature> SKXFeatures =
10091017
!listconcat(BDWFeatures, SKXAdditionalFeatures);
10101018

@@ -1047,7 +1055,8 @@ def ProcessorFeatures {
10471055
TuningNoDomainDelayMov,
10481056
TuningNoDomainDelayShuffle,
10491057
TuningNoDomainDelayBlend,
1050-
TuningFastImmVectorShift];
1058+
TuningFastImmVectorShift,
1059+
TuningAvoidMFENCE];
10511060
list<SubtargetFeature> CNLFeatures =
10521061
!listconcat(SKLFeatures, CNLAdditionalFeatures);
10531062

@@ -1076,7 +1085,8 @@ def ProcessorFeatures {
10761085
TuningNoDomainDelayMov,
10771086
TuningNoDomainDelayShuffle,
10781087
TuningNoDomainDelayBlend,
1079-
TuningFastImmVectorShift];
1088+
TuningFastImmVectorShift,
1089+
TuningAvoidMFENCE];
10801090
list<SubtargetFeature> ICLFeatures =
10811091
!listconcat(CNLFeatures, ICLAdditionalFeatures);
10821092

@@ -1222,7 +1232,8 @@ def ProcessorFeatures {
12221232
// Tremont
12231233
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
12241234
FeatureGFNI];
1225-
list<SubtargetFeature> TRMTuning = GLPTuning;
1235+
list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
1236+
list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
12261237
list<SubtargetFeature> TRMFeatures =
12271238
!listconcat(GLPFeatures, TRMAdditionalFeatures);
12281239

@@ -1429,7 +1440,8 @@ def ProcessorFeatures {
14291440
TuningFastScalarShiftMasks,
14301441
TuningBranchFusion,
14311442
TuningSBBDepBreaking,
1432-
TuningInsertVZEROUPPER];
1443+
TuningInsertVZEROUPPER,
1444+
TuningAvoidMFENCE];
14331445

14341446
// PileDriver
14351447
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1509,7 +1521,8 @@ def ProcessorFeatures {
15091521
TuningSlowSHLD,
15101522
TuningSBBDepBreaking,
15111523
TuningInsertVZEROUPPER,
1512-
TuningAllowLight256Bit];
1524+
TuningAllowLight256Bit,
1525+
TuningAvoidMFENCE];
15131526
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
15141527
FeatureRDPID,
15151528
FeatureRDPRU,
@@ -1664,7 +1677,8 @@ def : ProcModel<"nocona", GenericPostRAModel, [
16641677
],
16651678
[
16661679
TuningSlowUAMem16,
1667-
TuningInsertVZEROUPPER
1680+
TuningInsertVZEROUPPER,
1681+
TuningAvoidMFENCE
16681682
]>;
16691683

16701684
// Intel Core 2 Solo/Duo.
@@ -1684,7 +1698,8 @@ def : ProcModel<P, SandyBridgeModel, [
16841698
[
16851699
TuningMacroFusion,
16861700
TuningSlowUAMem16,
1687-
TuningInsertVZEROUPPER
1701+
TuningInsertVZEROUPPER,
1702+
TuningAvoidMFENCE
16881703
]>;
16891704
}
16901705
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1703,7 +1718,8 @@ def : ProcModel<P, SandyBridgeModel, [
17031718
[
17041719
TuningMacroFusion,
17051720
TuningSlowUAMem16,
1706-
TuningInsertVZEROUPPER
1721+
TuningInsertVZEROUPPER,
1722+
TuningAvoidMFENCE
17071723
]>;
17081724
}
17091725

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31103,7 +31103,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
3110331103
// cross-thread fence.
3110431104
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
3110531105
FenceSSID == SyncScope::System) {
31106-
if (Subtarget.hasMFence())
31106+
if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
3110731107
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
3110831108

3110931109
SDValue Chain = Op.getOperand(0);

llvm/test/CodeGen/X86/atomic-unordered.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2096,7 +2096,7 @@ define i64 @nofold_fence(ptr %p) {
20962096
; CHECK-LABEL: nofold_fence:
20972097
; CHECK: # %bb.0:
20982098
; CHECK-NEXT: movq (%rdi), %rax
2099-
; CHECK-NEXT: mfence
2099+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
21002100
; CHECK-NEXT: addq $15, %rax
21012101
; CHECK-NEXT: retq
21022102
%v = load atomic i64, ptr %p unordered, align 8
@@ -2170,7 +2170,7 @@ define i64 @fold_constant_fence(i64 %arg) {
21702170
; CHECK-LABEL: fold_constant_fence:
21712171
; CHECK: # %bb.0:
21722172
; CHECK-NEXT: movq Constant(%rip), %rax
2173-
; CHECK-NEXT: mfence
2173+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
21742174
; CHECK-NEXT: addq %rdi, %rax
21752175
; CHECK-NEXT: retq
21762176
%v = load atomic i64, ptr @Constant unordered, align 8
@@ -2197,7 +2197,7 @@ define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) {
21972197
; CHECK-LABEL: fold_invariant_fence:
21982198
; CHECK: # %bb.0:
21992199
; CHECK-NEXT: movq (%rdi), %rax
2200-
; CHECK-NEXT: mfence
2200+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
22012201
; CHECK-NEXT: addq %rsi, %rax
22022202
; CHECK-NEXT: retq
22032203
%v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{}
@@ -2321,7 +2321,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
23212321
; CHECK-O0-LABEL: fold_cmp_over_fence:
23222322
; CHECK-O0: # %bb.0:
23232323
; CHECK-O0-NEXT: movl (%rdi), %eax
2324-
; CHECK-O0-NEXT: mfence
2324+
; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
23252325
; CHECK-O0-NEXT: cmpl %eax, %esi
23262326
; CHECK-O0-NEXT: jne .LBB116_2
23272327
; CHECK-O0-NEXT: # %bb.1: # %taken
@@ -2335,7 +2335,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
23352335
; CHECK-O3-LABEL: fold_cmp_over_fence:
23362336
; CHECK-O3: # %bb.0:
23372337
; CHECK-O3-NEXT: movl (%rdi), %eax
2338-
; CHECK-O3-NEXT: mfence
2338+
; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
23392339
; CHECK-O3-NEXT: cmpl %eax, %esi
23402340
; CHECK-O3-NEXT: jne .LBB116_2
23412341
; CHECK-O3-NEXT: # %bb.1: # %taken

0 commit comments

Comments
 (0)