Skip to content

Commit b5f6005

Browse files
committed
[X86] Prefer lock or over mfence
Originally opened as https://reviews.llvm.org/D129947. LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch `fence seq_cst`. Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/ After another 2 years it doesn't look like anyone complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction.
1 parent f83eeac commit b5f6005

File tree

4 files changed

+815
-38
lines changed

4 files changed

+815
-38
lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,10 @@ def TuningUseGLMDivSqrtCosts
772772
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
773773
"Target has branch hint feature">;
774774

775+
def TuningAvoidMFENCE
776+
: SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
777+
"Avoid MFENCE for fence seq_cst, and instead use lock or">;
778+
775779
//===----------------------------------------------------------------------===//
776780
// X86 CPU Families
777781
// TODO: Remove these - use general tuning features to determine codegen.
@@ -833,7 +837,8 @@ def ProcessorFeatures {
833837
TuningSlow3OpsLEA,
834838
TuningSlowDivide64,
835839
TuningSlowIncDec,
836-
TuningInsertVZEROUPPER
840+
TuningInsertVZEROUPPER,
841+
TuningAvoidMFENCE
837842
];
838843

839844
list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -849,7 +854,8 @@ def ProcessorFeatures {
849854
TuningFastSHLDRotate,
850855
TuningFast15ByteNOP,
851856
TuningPOPCNTFalseDeps,
852-
TuningInsertVZEROUPPER
857+
TuningInsertVZEROUPPER,
858+
TuningAvoidMFENCE
853859
];
854860

855861
list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -868,7 +874,8 @@ def ProcessorFeatures {
868874
TuningPOPCNTFalseDeps,
869875
TuningLZCNTFalseDeps,
870876
TuningInsertVZEROUPPER,
871-
TuningAllowLight256Bit
877+
TuningAllowLight256Bit,
878+
TuningAvoidMFENCE
872879
];
873880

874881
list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -892,15 +899,17 @@ def ProcessorFeatures {
892899
TuningFastGather,
893900
TuningPOPCNTFalseDeps,
894901
TuningInsertVZEROUPPER,
895-
TuningAllowLight256Bit
902+
TuningAllowLight256Bit,
903+
TuningAvoidMFENCE
896904
];
897905

898906
// Nehalem
899907
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
900908
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
901909
TuningSlowDivide64,
902910
TuningInsertVZEROUPPER,
903-
TuningNoDomainDelayMov];
911+
TuningNoDomainDelayMov,
912+
TuningAvoidMFENCE];
904913

905914
// Westmere
906915
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -921,7 +930,8 @@ def ProcessorFeatures {
921930
TuningFast15ByteNOP,
922931
TuningPOPCNTFalseDeps,
923932
TuningInsertVZEROUPPER,
924-
TuningNoDomainDelayMov];
933+
TuningNoDomainDelayMov,
934+
TuningAvoidMFENCE];
925935
list<SubtargetFeature> SNBFeatures =
926936
!listconcat(WSMFeatures, SNBAdditionalFeatures);
927937

@@ -987,7 +997,8 @@ def ProcessorFeatures {
987997
TuningAllowLight256Bit,
988998
TuningNoDomainDelayMov,
989999
TuningNoDomainDelayShuffle,
990-
TuningNoDomainDelayBlend];
1000+
TuningNoDomainDelayBlend,
1001+
TuningAvoidMFENCE];
9911002
list<SubtargetFeature> SKLFeatures =
9921003
!listconcat(BDWFeatures, SKLAdditionalFeatures);
9931004

@@ -1022,7 +1033,8 @@ def ProcessorFeatures {
10221033
TuningNoDomainDelayMov,
10231034
TuningNoDomainDelayShuffle,
10241035
TuningNoDomainDelayBlend,
1025-
TuningFastImmVectorShift];
1036+
TuningFastImmVectorShift,
1037+
TuningAvoidMFENCE];
10261038
list<SubtargetFeature> SKXFeatures =
10271039
!listconcat(BDWFeatures, SKXAdditionalFeatures);
10281040

@@ -1065,7 +1077,8 @@ def ProcessorFeatures {
10651077
TuningNoDomainDelayMov,
10661078
TuningNoDomainDelayShuffle,
10671079
TuningNoDomainDelayBlend,
1068-
TuningFastImmVectorShift];
1080+
TuningFastImmVectorShift,
1081+
TuningAvoidMFENCE];
10691082
list<SubtargetFeature> CNLFeatures =
10701083
!listconcat(SKLFeatures, CNLAdditionalFeatures);
10711084

@@ -1094,7 +1107,8 @@ def ProcessorFeatures {
10941107
TuningNoDomainDelayMov,
10951108
TuningNoDomainDelayShuffle,
10961109
TuningNoDomainDelayBlend,
1097-
TuningFastImmVectorShift];
1110+
TuningFastImmVectorShift,
1111+
TuningAvoidMFENCE];
10981112
list<SubtargetFeature> ICLFeatures =
10991113
!listconcat(CNLFeatures, ICLAdditionalFeatures);
11001114

@@ -1268,7 +1282,8 @@ def ProcessorFeatures {
12681282
// Tremont
12691283
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
12701284
FeatureGFNI];
1271-
list<SubtargetFeature> TRMTuning = GLPTuning;
1285+
list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
1286+
list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
12721287
list<SubtargetFeature> TRMFeatures =
12731288
!listconcat(GLPFeatures, TRMAdditionalFeatures);
12741289

@@ -1446,7 +1461,8 @@ def ProcessorFeatures {
14461461
TuningFastImm16,
14471462
TuningSBBDepBreaking,
14481463
TuningSlowDivide64,
1449-
TuningSlowSHLD];
1464+
TuningSlowSHLD,
1465+
TuningAvoidMFENCE];
14501466
list<SubtargetFeature> BtVer2Features =
14511467
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
14521468

@@ -1475,7 +1491,8 @@ def ProcessorFeatures {
14751491
TuningFastScalarShiftMasks,
14761492
TuningBranchFusion,
14771493
TuningSBBDepBreaking,
1478-
TuningInsertVZEROUPPER];
1494+
TuningInsertVZEROUPPER,
1495+
TuningAvoidMFENCE];
14791496

14801497
// PileDriver
14811498
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1555,7 +1572,8 @@ def ProcessorFeatures {
15551572
TuningSlowSHLD,
15561573
TuningSBBDepBreaking,
15571574
TuningInsertVZEROUPPER,
1558-
TuningAllowLight256Bit];
1575+
TuningAllowLight256Bit,
1576+
TuningAvoidMFENCE];
15591577
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
15601578
FeatureRDPID,
15611579
FeatureRDPRU,
@@ -1740,7 +1758,8 @@ def : ProcModel<P, SandyBridgeModel, [
17401758
[
17411759
TuningMacroFusion,
17421760
TuningSlowUAMem16,
1743-
TuningInsertVZEROUPPER
1761+
TuningInsertVZEROUPPER,
1762+
TuningAvoidMFENCE
17441763
]>;
17451764
}
17461765
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1759,7 +1778,8 @@ def : ProcModel<P, SandyBridgeModel, [
17591778
[
17601779
TuningMacroFusion,
17611780
TuningSlowUAMem16,
1762-
TuningInsertVZEROUPPER
1781+
TuningInsertVZEROUPPER,
1782+
TuningAvoidMFENCE
17631783
]>;
17641784
}
17651785

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31906,7 +31906,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
3190631906
// especially clever.
3190731907

3190831908
// Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
31909-
// lowering for SSID == SyncScope::SingleThread and !hasMFence
31909+
// lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
3191031910
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
3191131911

3191231912
// Finally we can emit the atomic load.
@@ -31995,7 +31995,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
3199531995
// cross-thread fence.
3199631996
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
3199731997
FenceSSID == SyncScope::System) {
31998-
if (Subtarget.hasMFence())
31998+
if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
3199931999
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
3200032000

3200132001
SDValue Chain = Op.getOperand(0);

0 commit comments

Comments
 (0)