Skip to content

Commit 8fcc02e

Browse files
committed
merge main into amd-staging
Change-Id: Ie159c5b040f7aa31d849f0b91f18c4216cf1a369
2 parents ead7af7 + ce8e869 commit 8fcc02e

File tree

597 files changed

+14168
-6592
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

597 files changed

+14168
-6592
lines changed

.github/CODEOWNERS

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,14 @@ clang/test/AST/Interp/ @tbaederr
106106
# MLIR Sparsifier.
107107
/mlir/**/*SparseTensor*/ @aartbik @PeimingLiu @yinying-lisa-li @matthias-springer
108108

109+
# MLIR NVGPU Dialect
110+
/mlir/**/NVGPU*/ @grypp
111+
/mlir/test/**/CUDA/ @grypp
112+
113+
# MLIR NVVM Dialect
114+
/mlir/**/LLVMIR/**/BasicPtxBuilderInterface* @grypp
115+
/mlir/**/NVVM*/ @grypp
116+
109117
# BOLT
110118
/bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci
111119

bolt/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ if (BOLT_ENABLE_RUNTIME)
4545
execute_process(COMMAND ls /proc/self/map_files
4646
RESULT_VARIABLE LS OUTPUT_QUIET ERROR_QUIET)
4747
if (LS)
48-
set(BOLT_ENABLE_RUNTIME OFF)
4948
message(WARNING
50-
"BOLT runtime is disabled as /proc/self/map_files is unreadable.")
49+
"BOLT runtime may not be able to read /proc/self/map_files. Please use
50+
`--instrumentation-binpath <path-to-instrumented-binary>` option.")
5151
endif()
5252
endif()
5353

bolt/docs/BAT.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Hot indices are delta encoded, implicitly starting at zero.
7979
| ------ | ------| ----------- |
8080
| `Address` | Continuous, Delta, ULEB128 | Function address in the output binary |
8181
| `HotIndex` | Delta, ULEB128 | Cold functions only: index of corresponding hot function in hot functions table |
82+
| `FuncHash` | 8b | Hot functions only: function hash for input function |
8283
| `NumEntries` | ULEB128 | Number of address translation entries for a function |
8384
| `EqualElems` | ULEB128 | Hot functions only: number of equal offsets in the beginning of a function |
8485
| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | Hot functions only: if `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit |
@@ -94,6 +95,7 @@ entry is encoded. Input offsets implicitly start at zero.
9495
| ------ | ------| ----------- |
9596
| `OutputOffset` | Continuous, Delta, ULEB128 | Function offset in output binary |
9697
| `InputOffset` | Optional, Delta, SLEB128 | Function offset in input binary with `BRANCHENTRY` LSB bit |
98+
| `BBHash` | Optional, 8b | Basic block entries only: basic block hash in input binary |
9799

98100
`BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
99101
(branch or call instruction). If not set, it signifies a control flow target

bolt/include/bolt/Profile/BoltAddressTranslation.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,13 @@ class BoltAddressTranslation {
115115
/// Save function and basic block hashes used for metadata dump.
116116
void saveMetadata(BinaryContext &BC);
117117

118+
/// Returns BB hash by function output address (after BOLT) and basic block
119+
/// input offset.
120+
size_t getBBHash(uint64_t FuncOutputAddress, uint32_t BBInputOffset) const;
121+
122+
/// Returns BF hash by function output address (after BOLT).
123+
size_t getBFHash(uint64_t OutputAddress) const;
124+
118125
private:
119126
/// Helper to update \p Map by inserting one or more BAT entries reflecting
120127
/// \p BB for function located at \p FuncAddress. At least one entry will be
@@ -150,6 +157,9 @@ class BoltAddressTranslation {
150157
/// Links outlined cold blocks to their original function
151158
std::map<uint64_t, uint64_t> ColdPartSource;
152159

160+
/// Links output address of a main fragment back to input address.
161+
std::unordered_map<uint64_t, uint64_t> ReverseMap;
162+
153163
/// Identifies the address of a control-flow changing instruction in a
154164
/// translation map entry
155165
const static uint32_t BRANCHENTRY = 0x1;

bolt/include/bolt/Profile/YAMLProfileWriter.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef BOLT_PROFILE_YAML_PROFILE_WRITER_H
1010
#define BOLT_PROFILE_YAML_PROFILE_WRITER_H
1111

12+
#include "bolt/Profile/ProfileYAMLMapping.h"
1213
#include "llvm/Support/raw_ostream.h"
1314
#include <system_error>
1415

@@ -29,6 +30,9 @@ class YAMLProfileWriter {
2930

3031
/// Save execution profile for that instance.
3132
std::error_code writeProfile(const RewriteInstance &RI);
33+
34+
static yaml::bolt::BinaryFunctionProfile convert(const BinaryFunction &BF,
35+
bool UseDFS);
3236
};
3337

3438
} // namespace bolt

bolt/lib/Profile/BoltAddressTranslation.cpp

Lines changed: 78 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ const char *BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
2323
void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
2424
const BinaryBasicBlock &BB,
2525
uint64_t FuncAddress) {
26+
uint64_t HotFuncAddress = ColdPartSource.count(FuncAddress)
27+
? ColdPartSource[FuncAddress]
28+
: FuncAddress;
2629
const uint64_t BBOutputOffset =
2730
BB.getOutputAddressRange().first - FuncAddress;
2831
const uint32_t BBInputOffset = BB.getInputOffset();
@@ -39,6 +42,9 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
3942
LLVM_DEBUG(dbgs() << "BB " << BB.getName() << "\n");
4043
LLVM_DEBUG(dbgs() << " Key: " << Twine::utohexstr(BBOutputOffset)
4144
<< " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
45+
LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n",
46+
getBBHash(HotFuncAddress, BBInputOffset)));
47+
(void)HotFuncAddress;
4248
// In case of conflicts (same Key mapping to different Vals), the last
4349
// update takes precedence. Of course it is not ideal to have conflicts and
4450
// those happen when we have an empty BB that either contained only
@@ -72,20 +78,28 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
7278
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
7379
for (auto &BFI : BC.getBinaryFunctions()) {
7480
const BinaryFunction &Function = BFI.second;
81+
const uint64_t InputAddress = Function.getAddress();
82+
const uint64_t OutputAddress = Function.getOutputAddress();
7583
// We don't need a translation table if the body of the function hasn't
7684
// changed
7785
if (Function.isIgnored() || (!BC.HasRelocations && !Function.isSimple()))
7886
continue;
7987

88+
// TBD: handle BAT functions w/multiple entry points.
89+
if (Function.isMultiEntry())
90+
continue;
91+
8092
LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
8193
LLVM_DEBUG(dbgs() << " Address reference: 0x"
8294
<< Twine::utohexstr(Function.getOutputAddress()) << "\n");
95+
LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n", getBFHash(OutputAddress)));
8396

8497
MapTy Map;
8598
for (const BinaryBasicBlock *const BB :
8699
Function.getLayout().getMainFragment())
87100
writeEntriesForBB(Map, *BB, Function.getOutputAddress());
88101
Maps.emplace(Function.getOutputAddress(), std::move(Map));
102+
ReverseMap.emplace(OutputAddress, InputAddress);
89103

90104
if (!Function.isSplit())
91105
continue;
@@ -94,12 +108,12 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
94108
LLVM_DEBUG(dbgs() << " Cold part\n");
95109
for (const FunctionFragment &FF :
96110
Function.getLayout().getSplitFragments()) {
111+
ColdPartSource.emplace(FF.getAddress(), Function.getOutputAddress());
97112
Map.clear();
98113
for (const BinaryBasicBlock *const BB : FF)
99114
writeEntriesForBB(Map, *BB, FF.getAddress());
100115

101116
Maps.emplace(FF.getAddress(), std::move(Map));
102-
ColdPartSource.emplace(FF.getAddress(), Function.getOutputAddress());
103117
}
104118
}
105119

@@ -109,6 +123,11 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
109123
writeMaps</*Cold=*/true>(Maps, PrevAddress, OS);
110124

111125
BC.outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
126+
const uint64_t NumBBHashes = std::accumulate(
127+
FuncHashes.begin(), FuncHashes.end(), 0ull,
128+
[](size_t Acc, const auto &B) { return Acc + B.second.second.size(); });
129+
BC.outs() << "BOLT-INFO: Wrote " << FuncHashes.size() << " function and "
130+
<< NumBBHashes << " basic block hashes\n";
112131
}
113132

114133
APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
@@ -155,6 +174,11 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
155174
// Only process cold fragments in cold mode, and vice versa.
156175
if (Cold != ColdPartSource.count(Address))
157176
continue;
177+
// NB: here we use the input address because hashes are saved early (in
178+
// `saveMetadata`) before output addresses are assigned.
179+
const uint64_t HotInputAddress =
180+
ReverseMap[Cold ? ColdPartSource[Address] : Address];
181+
std::pair<size_t, BBHashMap> &FuncHashPair = FuncHashes[HotInputAddress];
158182
MapTy &Map = MapEntry.second;
159183
const uint32_t NumEntries = Map.size();
160184
LLVM_DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
@@ -166,6 +190,10 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
166190
std::distance(ColdPartSource.begin(), ColdPartSource.find(Address));
167191
encodeULEB128(HotIndex - PrevIndex, OS);
168192
PrevIndex = HotIndex;
193+
} else {
194+
// Function hash
195+
LLVM_DEBUG(dbgs() << "Hash: " << formatv("{0:x}\n", FuncHashPair.first));
196+
OS.write(reinterpret_cast<char *>(&FuncHashPair.first), 8);
169197
}
170198
encodeULEB128(NumEntries, OS);
171199
// For hot fragments only: encode the number of equal offsets
@@ -197,6 +225,13 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
197225
if (Index++ >= EqualElems)
198226
encodeSLEB128(KeyVal.second - InOffset, OS);
199227
InOffset = KeyVal.second; // Keeping InOffset as if BRANCHENTRY is encoded
228+
if ((InOffset & BRANCHENTRY) == 0) {
229+
// Basic block hash
230+
size_t BBHash = FuncHashPair.second[InOffset >> 1];
231+
OS.write(reinterpret_cast<char *>(&BBHash), 8);
232+
LLVM_DEBUG(dbgs() << formatv("{0:x} -> {1:x} {2:x}\n", KeyVal.first,
233+
InOffset >> 1, BBHash));
234+
}
200235
}
201236
}
202237
}
@@ -239,12 +274,18 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
239274
size_t HotIndex = 0;
240275
for (uint32_t I = 0; I < NumFunctions; ++I) {
241276
const uint64_t Address = PrevAddress + DE.getULEB128(&Offset, &Err);
277+
uint64_t HotAddress = Cold ? 0 : Address;
242278
PrevAddress = Address;
243279
if (Cold) {
244280
HotIndex += DE.getULEB128(&Offset, &Err);
245-
ColdPartSource.emplace(Address, HotFuncs[HotIndex]);
281+
HotAddress = HotFuncs[HotIndex];
282+
ColdPartSource.emplace(Address, HotAddress);
246283
} else {
247284
HotFuncs.push_back(Address);
285+
// Function hash
286+
const size_t FuncHash = DE.getU64(&Offset, &Err);
287+
FuncHashes[Address].first = FuncHash;
288+
LLVM_DEBUG(dbgs() << formatv("{0:x}: hash {1:x}\n", Address, FuncHash));
248289
}
249290
const uint32_t NumEntries = DE.getULEB128(&Offset, &Err);
250291
// Equal offsets, hot fragments only.
@@ -288,12 +329,22 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
288329
InputOffset += InputDelta;
289330
}
290331
Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset, InputOffset));
291-
LLVM_DEBUG(
292-
dbgs() << formatv("{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}\n",
293-
OutputOffset, InputOffset, OutputDelta,
294-
getULEB128Size(OutputDelta), InputDelta,
295-
(J < EqualElems) ? 0 : getSLEB128Size(InputDelta),
296-
OutputAddress));
332+
size_t BBHash = 0;
333+
const bool IsBranchEntry = InputOffset & BRANCHENTRY;
334+
if (!IsBranchEntry) {
335+
BBHash = DE.getU64(&Offset, &Err);
336+
// Map basic block hash to hot fragment by input offset
337+
FuncHashes[HotAddress].second.emplace(InputOffset >> 1, BBHash);
338+
}
339+
LLVM_DEBUG({
340+
dbgs() << formatv(
341+
"{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}", OutputOffset,
342+
InputOffset, OutputDelta, getULEB128Size(OutputDelta), InputDelta,
343+
(J < EqualElems) ? 0 : getSLEB128Size(InputDelta), OutputAddress);
344+
if (BBHash)
345+
dbgs() << formatv(" {0:x}", BBHash);
346+
dbgs() << '\n';
347+
});
297348
}
298349
Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
299350
}
@@ -303,7 +354,12 @@ void BoltAddressTranslation::dump(raw_ostream &OS) {
303354
const size_t NumTables = Maps.size();
304355
OS << "BAT tables for " << NumTables << " functions:\n";
305356
for (const auto &MapEntry : Maps) {
306-
OS << "Function Address: 0x" << Twine::utohexstr(MapEntry.first) << "\n";
357+
const uint64_t Address = MapEntry.first;
358+
const uint64_t HotAddress = fetchParentAddress(Address);
359+
OS << "Function Address: 0x" << Twine::utohexstr(Address);
360+
if (HotAddress == 0)
361+
OS << formatv(", hash: {0:x}", getBFHash(Address));
362+
OS << "\n";
307363
OS << "BB mappings:\n";
308364
for (const auto &Entry : MapEntry.second) {
309365
const bool IsBranch = Entry.second & BRANCHENTRY;
@@ -312,6 +368,9 @@ void BoltAddressTranslation::dump(raw_ostream &OS) {
312368
<< "0x" << Twine::utohexstr(Val);
313369
if (IsBranch)
314370
OS << " (branch)";
371+
else
372+
OS << formatv(" hash: {0:x}",
373+
getBBHash(HotAddress ? HotAddress : Address, Val));
315374
OS << "\n";
316375
}
317376
OS << "\n";
@@ -439,5 +498,15 @@ void BoltAddressTranslation::saveMetadata(BinaryContext &BC) {
439498
BB.getHash());
440499
}
441500
}
501+
502+
// Returns the basic block hash stored in FuncHashes for the function keyed by
// FuncOutputAddress and the block keyed by BBInputOffset. Both lookups use
// at() rather than operator[], so this const accessor never inserts an entry.
// NOTE(review): writeMaps indexes FuncHashes by the function's *input* address
// (hashes are saved before output addresses are assigned), while parseMaps
// fills it using the address read from the BAT section. Confirm that the key
// passed here matches whichever path populated FuncHashes.
size_t BoltAddressTranslation::getBBHash(uint64_t FuncOutputAddress,
503+
uint32_t BBInputOffset) const {
504+
return FuncHashes.at(FuncOutputAddress).second.at(BBInputOffset);
505+
}
506+
507+
// Returns the function hash (the first member of the FuncHashes entry) for the
// function keyed by OutputAddress. The lookup uses at() rather than
// operator[], so this const accessor never inserts an entry.
// NOTE(review): writeMaps indexes FuncHashes by the function's *input* address,
// while parseMaps fills it using the address read from the BAT section.
// Confirm that callers pass a key from the path that populated FuncHashes.
size_t BoltAddressTranslation::getBFHash(uint64_t OutputAddress) const {
508+
return FuncHashes.at(OutputAddress).first;
509+
}
510+
442511
} // namespace bolt
443512
} // namespace llvm

bolt/lib/Profile/YAMLProfileWriter.cpp

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
#include "bolt/Core/BinaryBasicBlock.h"
1111
#include "bolt/Core/BinaryFunction.h"
1212
#include "bolt/Profile/ProfileReaderBase.h"
13-
#include "bolt/Profile/ProfileYAMLMapping.h"
1413
#include "bolt/Rewrite/RewriteInstance.h"
1514
#include "llvm/Support/CommandLine.h"
1615
#include "llvm/Support/FileSystem.h"
@@ -26,15 +25,15 @@ extern llvm::cl::opt<bool> ProfileUseDFS;
2625
namespace llvm {
2726
namespace bolt {
2827

29-
namespace {
30-
void convert(const BinaryFunction &BF,
31-
yaml::bolt::BinaryFunctionProfile &YamlBF) {
28+
yaml::bolt::BinaryFunctionProfile
29+
YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS) {
30+
yaml::bolt::BinaryFunctionProfile YamlBF;
3231
const BinaryContext &BC = BF.getBinaryContext();
3332

3433
const uint16_t LBRProfile = BF.getProfileFlags() & BinaryFunction::PF_LBR;
3534

3635
// Prepare function and block hashes
37-
BF.computeHash(opts::ProfileUseDFS);
36+
BF.computeHash(UseDFS);
3837
BF.computeBlockHashes();
3938

4039
YamlBF.Name = BF.getPrintName();
@@ -44,7 +43,7 @@ void convert(const BinaryFunction &BF,
4443
YamlBF.ExecCount = BF.getKnownExecutionCount();
4544

4645
BinaryFunction::BasicBlockOrderType Order;
47-
llvm::copy(opts::ProfileUseDFS ? BF.dfs() : BF.getLayout().blocks(),
46+
llvm::copy(UseDFS ? BF.dfs() : BF.getLayout().blocks(),
4847
std::back_inserter(Order));
4948

5049
for (const BinaryBasicBlock *BB : Order) {
@@ -106,20 +105,14 @@ void convert(const BinaryFunction &BF,
106105
TargetName = Callee->getOneName();
107106
}
108107

108+
// Reads the annotation named Ann from Inst, passing 0 as the default value.
// The body uses the lambda's own Inst parameter, not the captured Instr, so
// the helper acts on whichever instruction the caller passes in.
auto getAnnotationWithDefault = [&](const MCInst &Inst, StringRef Ann) {
109+
return BC.MIB->getAnnotationWithDefault(Inst, Ann, 0ull);
110+
};
109111
if (BC.MIB->getConditionalTailCall(Instr)) {
110-
auto CTCCount =
111-
BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "CTCTakenCount");
112-
if (CTCCount) {
113-
CSI.Count = *CTCCount;
114-
auto CTCMispreds =
115-
BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "CTCMispredCount");
116-
if (CTCMispreds)
117-
CSI.Mispreds = *CTCMispreds;
118-
}
112+
CSI.Count = getAnnotationWithDefault(Instr, "CTCTakenCount");
113+
CSI.Mispreds = getAnnotationWithDefault(Instr, "CTCMispredCount");
119114
} else {
120-
auto Count = BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "Count");
121-
if (Count)
122-
CSI.Count = *Count;
115+
CSI.Count = getAnnotationWithDefault(Instr, "Count");
123116
}
124117

125118
if (CSI.Count)
@@ -165,8 +158,8 @@ void convert(const BinaryFunction &BF,
165158

166159
YamlBF.Blocks.emplace_back(YamlBB);
167160
}
161+
return YamlBF;
168162
}
169-
} // end anonymous namespace
170163

171164
std::error_code YAMLProfileWriter::writeProfile(const RewriteInstance &RI) {
172165
const BinaryContext &BC = RI.getBinaryContext();
@@ -222,9 +215,7 @@ std::error_code YAMLProfileWriter::writeProfile(const RewriteInstance &RI) {
222215
if (!BF.hasValidProfile() && !RI.getProfileReader()->isTrustedSource())
223216
continue;
224217

225-
yaml::bolt::BinaryFunctionProfile YamlBF;
226-
convert(BF, YamlBF);
227-
BP.Functions.emplace_back(YamlBF);
218+
BP.Functions.emplace_back(convert(BF, opts::ProfileUseDFS));
228219
}
229220
}
230221

bolt/test/X86/bolt-address-translation.test

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,21 +36,22 @@
3636
#
3737
# CHECK: BOLT: 3 out of 7 functions were overwritten.
3838
# CHECK: BOLT-INFO: Wrote 6 BAT maps
39-
# CHECK: BOLT-INFO: BAT section size (bytes): 336
39+
# CHECK: BOLT-INFO: Wrote 3 function and 58 basic block hashes
40+
# CHECK: BOLT-INFO: BAT section size (bytes): 816
4041
#
4142
# usqrt mappings (hot part). We match against any key (left side containing
4243
# the bolted binary offsets) because BOLT may change where it puts instructions
4344
# depending on whether it is relaxing a branch or not. But the original input
4445
# binary offsets (right side) should be the same because these addresses are
4546
# hardcoded in the blarge.yaml file.
4647
#
47-
# CHECK-BAT-DUMP: Function Address: 0x401170
48+
# CHECK-BAT-DUMP: Function Address: 0x401170, hash: 0xace6cbc638b31983
4849
# CHECK-BAT-DUMP-NEXT: BB mappings:
49-
# CHECK-BAT-DUMP-NEXT: 0x0 -> 0x0
50+
# CHECK-BAT-DUMP-NEXT: 0x0 -> 0x0 hash: 0x36007ba1d80c0000
5051
# CHECK-BAT-DUMP-NEXT: 0x8 -> 0x8 (branch)
51-
# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x39
52+
# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x39 hash: 0x5c06705524800039
5253
# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x3d (branch)
53-
# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x10
54+
# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x10 hash: 0xd70d7a64320e0010
5455
# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x30 (branch)
5556
#
5657
# CHECK-BAT-DUMP: 3 cold mappings

0 commit comments

Comments
 (0)