Skip to content

Commit d2192a8

Browse files
xur-llvmyuxuanchen1997
authored andcommitted
[PGO] Sampled instrumentation in PGO to speed up instrumentation binary (#69535)
Summary: In comparison to non-instrumented binaries, PGO instrumentation binaries can be significantly slower. For highly threaded programs, this slowdown can reach 10x due to data races or false sharing within counters. This patch incorporates sampling into the PGO instrumentation process to enhance the speed of instrumentation binaries. The fundamental concept is similar to the one proposed in https://reviews.llvm.org/D63949. Three sampling modes are introduced: 1. Simple Sampling: When '-sampled-instr-burst-duration' is set to 1. 2. Fast Burst Sampling: When not using simple sampling, and '-sampled-instr-period' is set to 65535. This is the default mode of sampling. 3. Full Burst Sampling: When neither simple nor fast burst sampling is used. Utilizing this sampled instrumentation significantly improves the binary's execution speed. Measurements show up to 5x speedup with default settings. Fast burst sampling now results in only around 20% to 30% slowdown (compared to 8 to 10x slowdown without sampling). Our tests show that profile quality remains good with sampling, with edge counts typically showing more than 90% overlap. For applications whose behavior changes due to binary speed, sampling instrumentation can enhance performance. Observations have shown some apps experiencing up to a ~2% improvement in PGO. A potential drawback of this patch is the increased binary size and compilation time. The Sampling method in this patch does not improve single threaded program instrumentation binary speed. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251137
1 parent 9f80cce commit d2192a8

File tree

13 files changed

+716
-25
lines changed

13 files changed

+716
-25
lines changed

llvm/include/llvm/ProfileData/InstrProfData.inc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
739739
#define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
740740
#define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
741741
#define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
742+
#define INSTR_PROF_PROFILE_SAMPLING_VAR __llvm_profile_sampling
742743

743744
/* The variable that holds the name of the profile data
744745
* specified via command line. */

llvm/include/llvm/Transforms/Instrumentation.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,12 +121,18 @@ struct InstrProfOptions {
121121
// Use BFI to guide register promotion
122122
bool UseBFIInPromotion = false;
123123

124+
// Use sampling to reduce the profile instrumentation runtime overhead.
125+
bool Sampling = false;
126+
124127
// Name of the profile file to use as output
125128
std::string InstrProfileOutput;
126129

127130
InstrProfOptions() = default;
128131
};
129132

133+
// Create the variable for profile sampling.
134+
void createProfileSamplingVar(Module &M);
135+
130136
// Options for sanitizer coverage instrumentation.
131137
struct SanitizerCoverageOptions {
132138
enum Type {

llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,14 @@ class FileSystem;
4343
class PGOInstrumentationGenCreateVar
4444
: public PassInfoMixin<PGOInstrumentationGenCreateVar> {
4545
public:
46-
PGOInstrumentationGenCreateVar(std::string CSInstrName = "")
47-
: CSInstrName(CSInstrName) {}
46+
PGOInstrumentationGenCreateVar(std::string CSInstrName = "",
47+
bool Sampling = false)
48+
: CSInstrName(CSInstrName), ProfileSampling(Sampling) {}
4849
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
4950

5051
private:
5152
std::string CSInstrName;
53+
bool ProfileSampling;
5254
};
5355

5456
/// The instrumentation (profile-instr-gen) pass for IR based PGO.

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,9 @@ static cl::opt<AttributorRunOption> AttributorRun(
296296
clEnumValN(AttributorRunOption::NONE, "none",
297297
"disable attributor runs")));
298298

299+
// Hidden pipeline-level switch for sampled PGO instrumentation. When set,
// addPGOInstrPasses sets InstrProfOptions::Sampling and turns counter
// promotion off, and the flag is forwarded to PGOInstrumentationGenCreateVar.
static cl::opt<bool> EnableSampledInstr(
300+
"enable-sampled-instrumentation", cl::init(false), cl::Hidden,
301+
cl::desc("Enable profile instrumentation sampling (default = off)"));
299302
static cl::opt<bool> UseLoopVersioningLICM(
300303
"enable-loop-versioning-licm", cl::init(false), cl::Hidden,
301304
cl::desc("Enable the experimental Loop Versioning LICM pass"));
@@ -847,6 +850,12 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
847850
// Do counter promotion at Level greater than O0.
848851
Options.DoCounterPromotion = true;
849852
Options.UseBFIInPromotion = IsCS;
853+
if (EnableSampledInstr) {
854+
Options.Sampling = true;
855+
// With sampling, there is little benefit to enabling counter promotion.
856+
// But note that sampling does work with counter promotion.
857+
Options.DoCounterPromotion = false;
858+
}
850859
Options.Atomic = AtomicCounterUpdate;
851860
MPM.addPass(InstrProfilingLoweringPass(Options, IsCS));
852861
}
@@ -1185,7 +1194,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
11851194
MPM.addPass(PGOIndirectCallPromotion(false, false));
11861195

11871196
if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr)
1188-
MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile));
1197+
MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile,
1198+
EnableSampledInstr));
11891199

11901200
if (IsMemprofUse)
11911201
MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS));

llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp

Lines changed: 223 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,30 @@ cl::opt<bool> SkipRetExitBlock(
170170
"skip-ret-exit-block", cl::init(true),
171171
cl::desc("Suppress counter promotion if exit blocks contain ret."));
172172

173+
// Pass-level switch for sampling. When it appears on the command line, its
// value takes precedence over InstrProfOptions::Sampling (see
// InstrLowerer::isSamplingEnabled).
static cl::opt<bool> SampledInstr("sampled-instrumentation", cl::ZeroOrMore,
174+
cl::init(false),
175+
cl::desc("Do PGO instrumentation sampling"));
176+
177+
// Length of one sampling period. Values up to USHRT_MAX select a 16-bit
// sampling variable, larger values a 32-bit one. With the default of 65535
// and a burst duration other than 1, doSampling emits no explicit period
// check (fast burst sampling). doSampling reports a fatal error unless the
// period is strictly greater than the burst duration.
static cl::opt<unsigned> SampledInstrPeriod(
178+
"sampled-instr-period",
179+
cl::desc("Set the profile instrumentation sample period. For each sample "
180+
"period, a fixed number of consecutive samples will be recorded. "
181+
"The number is controlled by 'sampled-instr-burst-duration' flag. "
182+
"The default sample period of 65535 is optimized for generating "
183+
"efficient code that leverages unsigned integer wrapping in "
184+
"overflow."),
185+
cl::init(65535));
186+
187+
// Number of counter updates recorded per sampling period. A value of 1 makes
// doSampling generate the simple-sampling code shape instead of a burst.
static cl::opt<unsigned> SampledInstrBurstDuration(
188+
"sampled-instr-burst-duration",
189+
cl::desc("Set the profile instrumentation burst duration, which can range "
190+
"from 0 to one less than the value of 'sampled-instr-period'. "
191+
"This number of samples will be recorded for each "
192+
"'sampled-instr-period' count update. Setting to 1 enables "
193+
"simple sampling, in which case it is recommended to set "
194+
"'sampled-instr-period' to a prime number."),
195+
cl::init(200));
196+
173197
using LoadStorePair = std::pair<Instruction *, Instruction *>;
174198

175199
static uint64_t getIntModuleFlagOrZero(const Module &M, StringRef Flag) {
@@ -260,6 +284,9 @@ class InstrLowerer final {
260284
/// Returns true if profile counter update register promotion is enabled.
261285
bool isCounterPromotionEnabled() const;
262286

287+
/// Return true if profile sampling is enabled.
288+
bool isSamplingEnabled() const;
289+
263290
/// Count the number of instrumented value sites for the function.
264291
void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins);
265292

@@ -291,6 +318,9 @@ class InstrLowerer final {
291318
/// acts on.
292319
Value *getCounterAddress(InstrProfCntrInstBase *I);
293320

321+
/// Lower the incremental instructions under profile sampling predicates.
322+
void doSampling(Instruction *I);
323+
294324
/// Get the region counters for an increment, creating them if necessary.
295325
///
296326
/// If the counter array doesn't yet exist, the profile data variables
@@ -635,33 +665,169 @@ PreservedAnalyses InstrProfilingLoweringPass::run(Module &M,
635665
return PreservedAnalyses::none();
636666
}
637667

668+
//
669+
// Perform instrumentation sampling.
670+
//
671+
// There are 3 flavors of sampling:
672+
// (1) Full burst sampling: We transform:
673+
// Increment_Instruction;
674+
// to:
675+
// if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
676+
// Increment_Instruction;
677+
// }
678+
// __llvm_profile_sampling__ += 1;
679+
// if (__llvm_profile_sampling__ >= SampledInstrPeriod) {
680+
// __llvm_profile_sampling__ = 0;
681+
// }
682+
//
683+
// "__llvm_profile_sampling__" is a thread-local global shared by all PGO
684+
// counters (value-instrumentation and edge instrumentation).
685+
//
686+
// (2) Fast burst sampling:
687+
// "__llvm_profile_sampling__" variable is an unsigned type, meaning it will
688+
// wrap around to zero when overflows. In this case, the second check is
689+
// unnecessary, so we won't generate check2 when the SampledInstrPeriod is
690+
// set to 65535 (64K - 1). The code after:
691+
// if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
692+
// Increment_Instruction;
693+
// }
694+
// __llvm_profile_sampling__ += 1;
695+
//
696+
// (3) Simple sampling:
697+
// When SampledInstrBurstDuration is set to 1, we do a simple sampling:
698+
// __llvm_profile_sampling__ += 1;
699+
// if (__llvm_profile_sampling__ >= SampledInstrPeriod) {
700+
// __llvm_profile_sampling__ = 0;
701+
// Increment_Instruction;
702+
// }
703+
//
704+
// Note that, the code snippet after the transformation can still be counter
705+
// promoted. However, with sampling enabled, counter updates are expected to
706+
// be infrequent, making the benefits of counter promotion negligible.
707+
// Moreover, counter promotion can potentially cause issues in server
708+
// applications, particularly when the counters are dumped without a clean
709+
// exit. To mitigate this risk, counter promotion is disabled by default when
710+
// sampling is enabled. This behavior can be overridden using the internal
711+
// option.
712+
// Guards the profiling intrinsic I with the sampling predicate selected by
// 'sampled-instr-period' and 'sampled-instr-burst-duration', splitting the
// enclosing basic block as needed. Does nothing when sampling is disabled.
// Reports a fatal error if the burst duration is not smaller than the period,
// and expects the sampling variable to already exist in the module.
void InstrLowerer::doSampling(Instruction *I) {
713+
if (!isSamplingEnabled())
714+
return;
715+
716+
unsigned SampledBurstDuration = SampledInstrBurstDuration.getValue();
717+
unsigned SampledPeriod = SampledInstrPeriod.getValue();
718+
if (SampledBurstDuration >= SampledPeriod) {
719+
report_fatal_error(
720+
"SampledPeriod needs to be greater than SampledBurstDuration");
721+
}
722+
bool UseShort = (SampledPeriod <= USHRT_MAX);
723+
bool IsSimpleSampling = (SampledBurstDuration == 1);
724+
// If (SampledBurstDuration == 1 && SampledPeriod == 65535), generate
725+
// the simple sampling style code.
726+
bool IsFastSampling = (!IsSimpleSampling && SampledPeriod == 65535);
727+
728+
auto GetConstant = [UseShort](IRBuilder<> &Builder, uint32_t C) {
729+
if (UseShort)
730+
return Builder.getInt16(C);
731+
else
732+
return Builder.getInt32(C);
733+
};
734+
735+
IntegerType *SamplingVarTy;
736+
if (UseShort)
737+
SamplingVarTy = Type::getInt16Ty(M.getContext());
738+
else
739+
SamplingVarTy = Type::getInt32Ty(M.getContext());
740+
auto *SamplingVar =
741+
M.getGlobalVariable(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
742+
assert(SamplingVar && "SamplingVar not set properly");
743+
744+
// Create the condition for checking the burst duration.
745+
Instruction *SamplingVarIncr;
746+
Value *NewSamplingVarVal;
747+
MDBuilder MDB(I->getContext());
748+
MDNode *BranchWeight;
749+
IRBuilder<> CondBuilder(I);
750+
auto *LoadSamplingVar = CondBuilder.CreateLoad(SamplingVarTy, SamplingVar);
751+
if (IsSimpleSampling) {
752+
// For the simple sampling, just create the load and increments.
753+
IRBuilder<> IncBuilder(I);
754+
NewSamplingVarVal =
755+
IncBuilder.CreateAdd(LoadSamplingVar, GetConstant(IncBuilder, 1));
756+
SamplingVarIncr = IncBuilder.CreateStore(NewSamplingVarVal, SamplingVar);
757+
} else {
758+
// For the burst-sampling, create the conditional update.
// The comparison is a strict '<' so that exactly SampledBurstDuration
// updates (sampling variable values 0 .. SampledBurstDuration - 1) are
// recorded per period, which is what the branch weights below assume; a
// burst duration of 0 therefore records nothing.
759+
auto *DurationCond = CondBuilder.CreateICmpULT(
760+
LoadSamplingVar, GetConstant(CondBuilder, SampledBurstDuration));
761+
BranchWeight = MDB.createBranchWeights(
762+
SampledBurstDuration, SampledPeriod + 1 - SampledBurstDuration);
763+
Instruction *ThenTerm = SplitBlockAndInsertIfThen(
764+
DurationCond, I, /* Unreachable */ false, BranchWeight);
765+
IRBuilder<> IncBuilder(I);
766+
NewSamplingVarVal =
767+
IncBuilder.CreateAdd(LoadSamplingVar, GetConstant(IncBuilder, 1));
768+
SamplingVarIncr = IncBuilder.CreateStore(NewSamplingVarVal, SamplingVar);
769+
I->moveBefore(ThenTerm);
770+
}
771+
772+
if (IsFastSampling)
773+
return;
774+
775+
// Create the condition for checking the period.
776+
Instruction *ThenTerm, *ElseTerm;
777+
IRBuilder<> PeriodCondBuilder(SamplingVarIncr);
778+
auto *PeriodCond = PeriodCondBuilder.CreateICmpUGE(
779+
NewSamplingVarVal, GetConstant(PeriodCondBuilder, SampledPeriod));
780+
BranchWeight = MDB.createBranchWeights(1, SampledPeriod);
781+
SplitBlockAndInsertIfThenElse(PeriodCond, SamplingVarIncr, &ThenTerm,
782+
&ElseTerm, BranchWeight);
783+
784+
// For the simple sampling, the counter update happens in sampling var reset.
785+
if (IsSimpleSampling)
786+
I->moveBefore(ThenTerm);
787+
788+
IRBuilder<> ResetBuilder(ThenTerm);
789+
ResetBuilder.CreateStore(GetConstant(ResetBuilder, 0), SamplingVar);
790+
SamplingVarIncr->moveBefore(ElseTerm);
791+
}
792+
638793
bool InstrLowerer::lowerIntrinsics(Function *F) {
639794
bool MadeChange = false;
640795
PromotionCandidates.clear();
796+
SmallVector<InstrProfInstBase *, 8> InstrProfInsts;
797+
798+
// To ensure compatibility with sampling, we save the intrinsics into
799+
// a buffer to prevent potential breakage of the iterator (as the
800+
// intrinsics will be moved to a different BB).
641801
for (BasicBlock &BB : *F) {
642802
for (Instruction &Instr : llvm::make_early_inc_range(BB)) {
643-
if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(&Instr)) {
644-
lowerIncrement(IPIS);
645-
MadeChange = true;
646-
} else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(&Instr)) {
647-
lowerIncrement(IPI);
648-
MadeChange = true;
649-
} else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(&Instr)) {
650-
lowerTimestamp(IPC);
651-
MadeChange = true;
652-
} else if (auto *IPC = dyn_cast<InstrProfCoverInst>(&Instr)) {
653-
lowerCover(IPC);
654-
MadeChange = true;
655-
} else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
656-
lowerValueProfileInst(IPVP);
657-
MadeChange = true;
658-
} else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(&Instr)) {
659-
IPMP->eraseFromParent();
660-
MadeChange = true;
661-
} else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(&Instr)) {
662-
lowerMCDCTestVectorBitmapUpdate(IPBU);
663-
MadeChange = true;
664-
}
803+
if (auto *IP = dyn_cast<InstrProfInstBase>(&Instr))
804+
InstrProfInsts.push_back(IP);
805+
}
806+
}
807+
808+
for (auto *Instr : InstrProfInsts) {
809+
doSampling(Instr);
810+
if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(Instr)) {
811+
lowerIncrement(IPIS);
812+
MadeChange = true;
813+
} else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(Instr)) {
814+
lowerIncrement(IPI);
815+
MadeChange = true;
816+
} else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(Instr)) {
817+
lowerTimestamp(IPC);
818+
MadeChange = true;
819+
} else if (auto *IPC = dyn_cast<InstrProfCoverInst>(Instr)) {
820+
lowerCover(IPC);
821+
MadeChange = true;
822+
} else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(Instr)) {
823+
lowerValueProfileInst(IPVP);
824+
MadeChange = true;
825+
} else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(Instr)) {
826+
IPMP->eraseFromParent();
827+
MadeChange = true;
828+
} else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(Instr)) {
829+
lowerMCDCTestVectorBitmapUpdate(IPBU);
830+
MadeChange = true;
665831
}
666832
}
667833

@@ -684,6 +850,12 @@ bool InstrLowerer::isRuntimeCounterRelocationEnabled() const {
684850
return TT.isOSFuchsia();
685851
}
686852

853+
// Reports whether counter updates should be sampled. An explicit
// 'sampled-instrumentation' option on the command line wins; otherwise the
// value configured in Options.Sampling is used.
bool InstrLowerer::isSamplingEnabled() const {
854+
if (SampledInstr.getNumOccurrences() > 0)
855+
return SampledInstr;
856+
return Options.Sampling;
857+
}
858+
687859
bool InstrLowerer::isCounterPromotionEnabled() const {
688860
if (DoCounterPromotion.getNumOccurrences() > 0)
689861
return DoCounterPromotion;
@@ -754,6 +926,9 @@ bool InstrLowerer::lower() {
754926
if (NeedsRuntimeHook)
755927
MadeChange = emitRuntimeHook();
756928

929+
if (!IsCS && isSamplingEnabled())
930+
createProfileSamplingVar(M);
931+
757932
bool ContainsProfiling = containsProfilingIntrinsics(M);
758933
GlobalVariable *CoverageNamesVar =
759934
M.getNamedGlobal(getCoverageUnusedNamesVarName());
@@ -1955,3 +2130,29 @@ void InstrLowerer::emitInitialization() {
19552130

19562131
appendToGlobalCtors(M, F, 0);
19572132
}
2133+
2134+
namespace llvm {
2135+
// Create the variable for profile sampling.
2136+
void createProfileSamplingVar(Module &M) {
2137+
const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
2138+
IntegerType *SamplingVarTy;
2139+
Constant *ValueZero;
2140+
if (SampledInstrPeriod.getValue() <= USHRT_MAX) {
2141+
SamplingVarTy = Type::getInt16Ty(M.getContext());
2142+
ValueZero = Constant::getIntegerValue(SamplingVarTy, APInt(16, 0));
2143+
} else {
2144+
SamplingVarTy = Type::getInt32Ty(M.getContext());
2145+
ValueZero = Constant::getIntegerValue(SamplingVarTy, APInt(32, 0));
2146+
}
2147+
auto SamplingVar = new GlobalVariable(
2148+
M, SamplingVarTy, false, GlobalValue::WeakAnyLinkage, ValueZero, VarName);
2149+
SamplingVar->setVisibility(GlobalValue::DefaultVisibility);
2150+
SamplingVar->setThreadLocal(true);
2151+
Triple TT(M.getTargetTriple());
2152+
if (TT.supportsCOMDAT()) {
2153+
SamplingVar->setLinkage(GlobalValue::ExternalLinkage);
2154+
SamplingVar->setComdat(M.getOrInsertComdat(VarName));
2155+
}
2156+
appendToCompilerUsed(M, SamplingVar);
2157+
}
2158+
} // namespace llvm

llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1875,6 +1875,8 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &MAM) {
18751875
// The variable in a comdat may be discarded by LTO. Ensure the declaration
18761876
// will be retained.
18771877
appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true));
1878+
if (ProfileSampling)
1879+
createProfileSamplingVar(M);
18781880
PreservedAnalyses PA;
18791881
PA.preserve<FunctionAnalysisManagerModuleProxy>();
18801882
PA.preserveSet<AllAnalysesOn<Function>>();

0 commit comments

Comments
 (0)