Skip to content

Commit 885409a

Browse files
committed
[CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds
1 parent 9570974 commit 885409a

File tree

7 files changed

+343
-6
lines changed

7 files changed

+343
-6
lines changed

llvm/include/llvm/CGData/CodeGenData.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
164164
CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
165165
}
166166

167+
/// Initialize the two-codegen rounds.
168+
void initializeTwoCodegenRounds();
169+
170+
/// Save the current module before the first codegen round.
171+
void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
172+
173+
/// Load the current module before the second codegen round.
174+
std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
175+
unsigned Task,
176+
LLVMContext &Context);
177+
178+
/// Merge the codegen data from the input files in scratch vector in ThinLTO
179+
/// two-codegen rounds.
180+
Error mergeCodeGenData(
181+
const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
182+
167183
void warn(Error E, StringRef Whence = "");
168184
void warn(Twine Message, std::string Whence = "", std::string Hint = "");
169185

llvm/lib/CGData/CodeGenData.cpp

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "llvm/Object/ObjectFile.h"
1818
#include "llvm/Support/CommandLine.h"
1919
#include "llvm/Support/FileSystem.h"
20+
#include "llvm/Support/Path.h"
2021
#include "llvm/Support/WithColor.h"
2122

2223
#define DEBUG_TYPE "cg-data"
@@ -30,6 +31,14 @@ cl::opt<bool>
3031
cl::opt<std::string>
3132
CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
3233
cl::desc("File path to where .cgdata file is read"));
34+
cl::opt<bool> CodeGenDataThinLTOTwoRounds(
35+
"codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden,
36+
cl::desc("Enable two-round ThinLTO code generation. The first round "
37+
"generates code and emits CodeGen data, while the second round "
38+
"uses the emitted data for further optimizations."));
39+
40+
// Path to where the optimized bitcodes are saved and restored for ThinLTO.
41+
static SmallString<128> CodeGenDataThinLTOTwoRoundsPath;
3342

3443
static std::string getCGDataErrString(cgdata_error Err,
3544
const std::string &ErrMsg = "") {
@@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() {
139148
std::call_once(CodeGenData::OnceFlag, []() {
140149
Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
141150

142-
if (CodeGenDataGenerate)
151+
if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds)
143152
Instance->EmitCGData = true;
144153
else if (!CodeGenDataUsePath.empty()) {
145154
// Initialize the global CGData if the input file name is given.
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) {
215224
}
216225
}
217226

227+
/// Build the path of the bitcode file that stores the saved module for
/// \p Task inside the directory \p Dir: "<Dir>/<Task>.saved_copy.bc".
///
/// The file name is joined onto \p Dir with sys::path::append rather than by
/// concatenating a literal "/", so the separator follows the host path style
/// and is not duplicated when \p Dir already ends with one.
///
/// \param Dir  Directory that holds the saved modules.
/// \param Task ThinLTO task id used as the file name stem.
/// \returns The full path as an owning string.
static std::string getPath(StringRef Dir, unsigned Task) {
  llvm::SmallString<128> Path(Dir);
  llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc");
  return std::string(Path);
}
230+
231+
void initializeTwoCodegenRounds() {
232+
assert(CodeGenDataThinLTOTwoRounds);
233+
if (auto EC = llvm::sys::fs::createUniqueDirectory(
234+
"cgdata", CodeGenDataThinLTOTwoRoundsPath))
235+
report_fatal_error(Twine("Failed to create directory: ") + EC.message());
236+
}
237+
238+
void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
239+
assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
240+
std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
241+
std::error_code EC;
242+
raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None);
243+
if (EC)
244+
report_fatal_error(Twine("Failed to open ") + Path +
245+
" to save optimized bitcode: " + EC.message());
246+
WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
247+
}
248+
249+
/// Reload the bitcode that saveModuleForTwoRounds wrote for \p Task and parse
/// it into a fresh module owned by \p Context.
///
/// The two-rounds directory must already exist (asserted). Terminates through
/// report_fatal_error if the file cannot be read or the bitcode cannot be
/// parsed; in the latter case the parser's own diagnostic is appended to the
/// message.
///
/// \param OrigModule Original bitcode module; only its module identifier is
///                   used, to rename the restored module.
/// \param Task       ThinLTO task id; selects the file to load.
/// \param Context    LLVMContext the restored module is created in.
/// \returns The restored module, carrying \p OrigModule's identifier.
std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
                                               unsigned Task,
                                               LLVMContext &Context) {
  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
  auto FileOrError = MemoryBuffer::getFile(Path);
  if (auto EC = FileOrError.getError())
    report_fatal_error(Twine("Failed to open ") + Path +
                       " to load optimized bitcode: " + EC.message());

  std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
  auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
  if (!RestoredModule)
    // Take the error out of the Expected and report its text; otherwise the
    // reason for the parse failure is lost.
    report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
                       Path + ": " + toString(RestoredModule.takeError()) +
                       "\n");

  // Restore the original module identifier.
  (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
  return std::move(*RestoredModule);
}
269+
270+
/// Merge the codegen data found in a set of in-memory object files and, if
/// any was found, publish the resulting outlined hash tree.
///
/// \param InputFiles One buffer per input; each non-empty buffer is read as
///                   an object file. Empty buffers are skipped.
/// \returns Error::success() when every non-empty buffer was read and merged;
///          otherwise the first error from creating the object file or from
///          merging its codegen data.
Error mergeCodeGenData(
    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
  OutlinedHashTreeRecord MergedRecord;

  for (const auto &Contents : *InputFiles) {
    // Nothing to read from an empty buffer.
    if (Contents.empty())
      continue;

    // Wrap the bytes without copying; the buffer is not null-terminated.
    auto Buffer = MemoryBuffer::getMemBuffer(
        StringRef(Contents.data(), Contents.size()), "in-memory object file",
        /*RequiresNullTerminator=*/false);

    auto ObjOrErr =
        object::ObjectFile::createObjectFile(Buffer->getMemBufferRef());
    if (!ObjOrErr)
      return ObjOrErr.takeError();

    if (Error MergeErr = CodeGenDataReader::mergeFromObjectFile(
            ObjOrErr->get(), MergedRecord))
      return MergeErr;
  }

  // Only publish when the merge actually produced something.
  if (MergedRecord.empty())
    return Error::success();

  cgdata::publishOutlinedHashTree(std::move(MergedRecord.HashTree));
  return Error::success();
}
296+
218297
} // end namespace cgdata
219298

220299
} // end namespace llvm

llvm/lib/LTO/LTO.cpp

Lines changed: 98 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "llvm/Analysis/TargetTransformInfo.h"
2222
#include "llvm/Bitcode/BitcodeReader.h"
2323
#include "llvm/Bitcode/BitcodeWriter.h"
24+
#include "llvm/CGData/CodeGenData.h"
2425
#include "llvm/CodeGen/Analysis.h"
2526
#include "llvm/Config/llvm-config.h"
2627
#include "llvm/IR/AutoUpgrade.h"
@@ -71,6 +72,8 @@ static cl::opt<bool>
7172
DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
7273
cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
7374

75+
extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
76+
7477
namespace llvm {
7578
/// Enable global value internalization in LTO.
7679
cl::opt<bool> EnableLTOInternalization(
@@ -1459,7 +1462,7 @@ class InProcessThinBackend : public ThinBackendProc {
14591462
GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
14601463
}
14611464

1462-
Error runThinLTOBackendThread(
1465+
virtual Error runThinLTOBackendThread(
14631466
AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
14641467
ModuleSummaryIndex &CombinedIndex,
14651468
const FunctionImporter::ImportMapTy &ImportList,
@@ -1560,6 +1563,60 @@ class InProcessThinBackend : public ThinBackendProc {
15601563
return BackendThreadPool.getMaxConcurrency();
15611564
}
15621565
};
1566+
1567+
/// This Backend will run ThinBackend process but throw away all the output from
1568+
/// the codegen. This class facilitates the first codegen round.
1569+
class NoOutputThinBackend : public InProcessThinBackend {
1570+
public:
1571+
NoOutputThinBackend(
1572+
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
1573+
ThreadPoolStrategy ThinLTOParallelism,
1574+
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
1575+
std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch)
1576+
: InProcessThinBackend(
1577+
Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries,
1578+
// Allocate a scratch buffer for each task to write output to.
1579+
[Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) {
1580+
return std::make_unique<CachedFileStream>(
1581+
std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
1582+
},
1583+
FileCache(), nullptr, false, false),
1584+
Scratch(std::move(Scratch)) {}
1585+
1586+
/// Scratch space for writing output during the codegen.
1587+
std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
1588+
};
1589+
1590+
/// This Backend performs codegen on bitcode that was previously saved after
1591+
/// going through optimization. This class facilitates the second codegen round.
1592+
class OptimizedBitcodeThinBackend : public InProcessThinBackend {
1593+
public:
1594+
OptimizedBitcodeThinBackend(
1595+
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
1596+
ThreadPoolStrategy ThinLTOParallelism,
1597+
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
1598+
AddStreamFn AddStream)
1599+
: InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
1600+
ModuleToDefinedGVSummaries, AddStream, FileCache(),
1601+
nullptr, false, false) {}
1602+
1603+
virtual Error runThinLTOBackendThread(
1604+
AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
1605+
ModuleSummaryIndex &CombinedIndex,
1606+
const FunctionImporter::ImportMapTy &ImportList,
1607+
const FunctionImporter::ExportSetTy &ExportList,
1608+
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
1609+
const GVSummaryMapTy &DefinedGlobals,
1610+
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
1611+
LTOLLVMContext BackendContext(Conf);
1612+
std::unique_ptr<Module> LoadedModule =
1613+
cgdata::loadModuleForTwoRounds(BM, Task, BackendContext);
1614+
1615+
return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
1616+
ImportList, DefinedGlobals, &ModuleMap,
1617+
/*CodeGenOnly=*/true);
1618+
}
1619+
};
15631620
} // end anonymous namespace
15641621

15651622
ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
@@ -1880,10 +1937,46 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
18801937
return BackendProcess->wait();
18811938
};
18821939

1883-
std::unique_ptr<ThinBackendProc> BackendProc =
1884-
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
1885-
AddStream, Cache);
1886-
return RunBackends(BackendProc.get());
1940+
if (!CodeGenDataThinLTOTwoRounds) {
1941+
std::unique_ptr<ThinBackendProc> BackendProc =
1942+
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
1943+
AddStream, Cache);
1944+
return RunBackends(BackendProc.get());
1945+
}
1946+
1947+
// Perform two rounds of code generation for ThinLTO:
1948+
// 1. First round: Run optimization and code generation with a scratch output.
1949+
// 2. Merge codegen data extracted from the scratch output.
1950+
// 3. Second round: Run code generation again using the merged data.
1951+
LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n");
1952+
1953+
// Initialize a temporary path to store and retrieve optimized IRs for
1954+
// two-round code generation.
1955+
cgdata::initializeTwoCodegenRounds();
1956+
1957+
// Create a scratch output to hold intermediate results.
1958+
auto Outputs =
1959+
std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks());
1960+
auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
1961+
Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
1962+
ModuleToDefinedGVSummaries, std::move(Outputs));
1963+
// First round: Run optimization and code generation with a scratch output.
1964+
// Before code generation, serialize modules.
1965+
if (Error E = RunBackends(FirstRoundLTO.get()))
1966+
return E;
1967+
1968+
// Merge codegen data extracted from the scratch output.
1969+
if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch)))
1970+
return E;
1971+
1972+
// Second round: Run code generation by reading IRs.
1973+
std::unique_ptr<ThinBackendProc> SecondRoundLTO =
1974+
std::make_unique<OptimizedBitcodeThinBackend>(
1975+
Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
1976+
ModuleToDefinedGVSummaries, AddStream);
1977+
Error E = RunBackends(SecondRoundLTO.get());
1978+
1979+
return E;
18871980
}
18881981

18891982
Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(

llvm/lib/LTO/LTOBackend.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "llvm/Analysis/TargetLibraryInfo.h"
2121
#include "llvm/Bitcode/BitcodeReader.h"
2222
#include "llvm/Bitcode/BitcodeWriter.h"
23+
#include "llvm/CGData/CodeGenData.h"
2324
#include "llvm/IR/LLVMRemarkStreamer.h"
2425
#include "llvm/IR/LegacyPassManager.h"
2526
#include "llvm/IR/PassManager.h"
@@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged(
7475
cl::desc("Assume the input has already undergone ThinLTO function "
7576
"importing and the other pre-optimization pipeline changes."));
7677

78+
extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
79+
7780
namespace llvm {
7881
extern cl::opt<bool> NoPGOWarnMismatch;
7982
}
@@ -599,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
599602
auto OptimizeAndCodegen =
600603
[&](Module &Mod, TargetMachine *TM,
601604
std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) {
605+
// Perform optimization and code generation for ThinLTO.
602606
if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
603607
/*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
604608
CmdArgs))
605609
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
606610

611+
// Save the current module before the first codegen round.
612+
// Note that the second codegen round runs only `codegen()` without
613+
// running `opt()`. That round never reaches this point: it bails out earlier
614+
// because `OptimizedBitcodeThinBackend` sets CodeGenOnly.
615+
if (CodeGenDataThinLTOTwoRounds)
616+
cgdata::saveModuleForTwoRounds(Mod, Task);
617+
607618
codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
608619
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
609620
};
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
2+
; using codegen data that has been read from a previous codegen run.
3+
4+
; RUN: split-file %s %t
5+
6+
; First, we generate the cgdata file from a local outline instance present in local-two.ll.
7+
; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
8+
; RUN: llvm-cgdata --merge %t_write -o %t_cgdata
9+
; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
10+
11+
; SHOW: Outlined hash tree:
12+
; SHOW-NEXT: Total Node Count: 4
13+
; SHOW-NEXT: Terminal Node Count: 1
14+
; SHOW-NEXT: Depth: 3
15+
16+
; Now, we read the cgdata in the machine outliner, enabling us to optimistically
17+
; outline a singleton instance in local-one.ll that matches against the cgdata.
18+
; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one.ll -o %t_read
19+
; RUN: llvm-objdump -d %t_read | FileCheck %s
20+
21+
; CHECK: _OUTLINED_FUNCTION
22+
; CHECK-NEXT: mov
23+
; CHECK-NEXT: mov
24+
; CHECK-NEXT: b
25+
26+
;--- local-two.ll
27+
declare i32 @g(i32, i32, i32)
28+
define i32 @f1() minsize {
29+
%1 = call i32 @g(i32 10, i32 1, i32 2);
30+
ret i32 %1
31+
}
32+
define i32 @f2() minsize {
33+
%1 = call i32 @g(i32 20, i32 1, i32 2);
34+
ret i32 %1
35+
}
36+
37+
;--- local-one.ll
38+
declare i32 @g(i32, i32, i32)
39+
define i32 @f3() minsize {
40+
%1 = call i32 @g(i32 30, i32 1, i32 2);
41+
ret i32 %1
42+
}

0 commit comments

Comments
 (0)