Skip to content

Commit 9e5c136

Browse files
[PGO][Offload] Profile profraw generation for GPU instrumentation #76587 (#93365)
This pull request is the second part of an ongoing effort to extend PGO instrumentation to GPU device code and depends on #76587. This PR makes the following changes: - Introduces `__llvm_write_custom_profile` to the PGO compiler-rt library. This is an external function that can be used to write profiles with custom data to target-specific files. - Adds `__llvm_write_custom_profile` as a weak symbol to libomptarget so that it can write the collected data to a profraw file. - Adds a `PGODump` debug flag and only displays the dump when the aforementioned flag is set.
1 parent 84e3c6f commit 9e5c136

File tree

10 files changed

+209
-28
lines changed

10 files changed

+209
-28
lines changed

compiler-rt/lib/profile/InstrProfiling.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,17 @@ int __llvm_profile_get_padding_sizes_for_counters(
304304
*/
305305
void __llvm_profile_set_dumped(void);
306306

307+
/*!
308+
* \brief Write custom target-specific profiling data to a separate file.
309+
* Used by offload PGO.
310+
*/
311+
int __llvm_write_custom_profile(const char *Target,
312+
const __llvm_profile_data *DataBegin,
313+
const __llvm_profile_data *DataEnd,
314+
const char *CountersBegin,
315+
const char *CountersEnd, const char *NamesBegin,
316+
const char *NamesEnd);
317+
307318
/*!
308319
* This variable is defined in InstrProfilingRuntime.cpp as a hidden
309320
* symbol. Its main purpose is to enable profile runtime user to

compiler-rt/lib/profile/InstrProfilingFile.c

Lines changed: 115 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,17 @@ static FILE *getFileObject(const char *OutputName) {
541541
return fopen(OutputName, "ab");
542542
}
543543

544+
/* Release \c OutputFile after a profile write.
 *
 * A stream that is not the current profile file is simply closed. The current
 * profile file is kept open: it is flushed and, when merging is enabled and
 * continuous mode is not, its file lock is released. */
static void closeFileObject(FILE *OutputFile) {
  if (OutputFile != getProfileFile()) {
    fclose(OutputFile);
    return;
  }

  fflush(OutputFile);
  if (doMerging() && !__llvm_profile_is_continuous_mode_enabled())
    lprofUnlockFileHandle(OutputFile);
}
554+
544555
/* Write profile data to file \c OutputName. */
545556
static int writeFile(const char *OutputName) {
546557
int RetVal;
@@ -562,15 +573,7 @@ static int writeFile(const char *OutputName) {
562573
initFileWriter(&fileWriter, OutputFile);
563574
RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone);
564575

565-
if (OutputFile == getProfileFile()) {
566-
fflush(OutputFile);
567-
if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) {
568-
lprofUnlockFileHandle(OutputFile);
569-
}
570-
} else {
571-
fclose(OutputFile);
572-
}
573-
576+
closeFileObject(OutputFile);
574577
return RetVal;
575578
}
576579

@@ -1359,4 +1362,107 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
13591362
return 0;
13601363
}
13611364

1365+
int __llvm_write_custom_profile(const char *Target,
1366+
const __llvm_profile_data *DataBegin,
1367+
const __llvm_profile_data *DataEnd,
1368+
const char *CountersBegin,
1369+
const char *CountersEnd, const char *NamesBegin,
1370+
const char *NamesEnd) {
1371+
int ReturnValue = 0, FilenameLength, TargetLength;
1372+
char *FilenameBuf, *TargetFilename;
1373+
const char *Filename;
1374+
1375+
/* Save old profile data */
1376+
FILE *oldFile = getProfileFile();
1377+
1378+
// Temporarily suspend getting SIGKILL when the parent exits.
1379+
int PDeathSig = lprofSuspendSigKill();
1380+
1381+
if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
1382+
PROF_NOTE("Profile data not written to file: %s.\n", "already written");
1383+
if (PDeathSig == 1)
1384+
lprofRestoreSigKill();
1385+
return 0;
1386+
}
1387+
1388+
/* Check if there is llvm/runtime version mismatch. */
1389+
if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
1390+
PROF_ERR("Runtime and instrumentation version mismatch : "
1391+
"expected %d, but get %d\n",
1392+
INSTR_PROF_RAW_VERSION,
1393+
(int)GET_VERSION(__llvm_profile_get_version()));
1394+
if (PDeathSig == 1)
1395+
lprofRestoreSigKill();
1396+
return -1;
1397+
}
1398+
1399+
/* Get current filename */
1400+
FilenameLength = getCurFilenameLength();
1401+
FilenameBuf = (char *)COMPILER_RT_ALLOCA(FilenameLength + 1);
1402+
Filename = getCurFilename(FilenameBuf, 0);
1403+
1404+
/* Check the filename. */
1405+
if (!Filename) {
1406+
PROF_ERR("Failed to write file : %s\n", "Filename not set");
1407+
if (PDeathSig == 1)
1408+
lprofRestoreSigKill();
1409+
return -1;
1410+
}
1411+
1412+
/* Allocate new space for our target-specific PGO filename */
1413+
TargetLength = strlen(Target);
1414+
TargetFilename =
1415+
(char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2);
1416+
1417+
/* Find file basename and path sizes */
1418+
int32_t DirEnd = FilenameLength - 1;
1419+
while (DirEnd >= 0 && !IS_DIR_SEPARATOR(Filename[DirEnd])) {
1420+
DirEnd--;
1421+
}
1422+
uint32_t DirSize = DirEnd + 1, BaseSize = FilenameLength - DirSize;
1423+
1424+
/* Prepend "TARGET." to current filename */
1425+
if (DirSize > 0) {
1426+
memcpy(TargetFilename, Filename, DirSize);
1427+
}
1428+
memcpy(TargetFilename + DirSize, Target, TargetLength);
1429+
TargetFilename[TargetLength + DirSize] = '.';
1430+
memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize,
1431+
BaseSize);
1432+
TargetFilename[FilenameLength + 1 + TargetLength] = 0;
1433+
1434+
/* Open and truncate target-specific PGO file */
1435+
FILE *OutputFile = fopen(TargetFilename, "w");
1436+
setProfileFile(OutputFile);
1437+
1438+
if (!OutputFile) {
1439+
PROF_ERR("Failed to open file : %s\n", TargetFilename);
1440+
if (PDeathSig == 1)
1441+
lprofRestoreSigKill();
1442+
return -1;
1443+
}
1444+
1445+
FreeHook = &free;
1446+
setupIOBuffer();
1447+
1448+
/* Write custom data */
1449+
ProfDataWriter fileWriter;
1450+
initFileWriter(&fileWriter, OutputFile);
1451+
1452+
/* Write custom data to the file */
1453+
ReturnValue = lprofWriteDataImpl(
1454+
&fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
1455+
lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0);
1456+
closeFileObject(OutputFile);
1457+
1458+
// Restore SIGKILL.
1459+
if (PDeathSig == 1)
1460+
lprofRestoreSigKill();
1461+
1462+
/* Restore old profiling file */
1463+
setProfileFile(oldFile);
1464+
1465+
return ReturnValue;
1466+
}
1467+
13621468
#endif

offload/include/Shared/Environment.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ enum class DeviceDebugKind : uint32_t {
3030
FunctionTracing = 1U << 1,
3131
CommonIssues = 1U << 2,
3232
AllocationTracker = 1U << 3,
33+
PGODump = 1U << 4,
3334
};
3435

3536
struct DeviceEnvironmentTy {

offload/plugins-nextgen/common/include/GlobalHandler.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,22 @@ struct __llvm_profile_data {
6363
#include "llvm/ProfileData/InstrProfData.inc"
6464
};
6565

66+
extern "C" {
67+
extern int __attribute__((weak)) __llvm_write_custom_profile(
68+
const char *Target, const __llvm_profile_data *DataBegin,
69+
const __llvm_profile_data *DataEnd, const char *CountersBegin,
70+
const char *CountersEnd, const char *NamesBegin, const char *NamesEnd);
71+
}
72+
6673
/// PGO profiling data extracted from a GPU device
6774
struct GPUProfGlobals {
68-
SmallVector<uint8_t> NamesData;
69-
SmallVector<SmallVector<int64_t>> Counts;
75+
SmallVector<int64_t> Counts;
7076
SmallVector<__llvm_profile_data> Data;
77+
SmallVector<uint8_t> NamesData;
7178
Triple TargetTriple;
7279

7380
void dump() const;
81+
Error write() const;
7482
};
7583

7684
/// Subclass of GlobalTy that holds the memory for a global of \p Ty.

offload/plugins-nextgen/common/src/GlobalHandler.cpp

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
206206
GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data());
207207
if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
208208
return Err;
209-
DeviceProfileData.Counts.push_back(std::move(Counts));
209+
DeviceProfileData.Counts.append(std::move(Counts));
210210
} else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) {
211211
// Read profiling data for this global variable
212212
__llvm_profile_data Data{};
@@ -224,15 +224,14 @@ void GPUProfGlobals::dump() const {
224224
<< "\n";
225225

226226
outs() << "======== Counters =========\n";
227-
for (const auto &Count : Counts) {
228-
outs() << "[";
229-
for (size_t i = 0; i < Count.size(); i++) {
230-
if (i == 0)
231-
outs() << " ";
232-
outs() << Count[i] << " ";
233-
}
234-
outs() << "]\n";
227+
for (size_t i = 0; i < Counts.size(); i++) {
228+
if (i > 0 && i % 10 == 0)
229+
outs() << "\n";
230+
else if (i != 0)
231+
outs() << " ";
232+
outs() << Counts[i];
235233
}
234+
outs() << "\n";
236235

237236
outs() << "========== Data ===========\n";
238237
for (const auto &ProfData : Data) {
@@ -264,3 +263,43 @@ void GPUProfGlobals::dump() const {
264263
Symtab.dumpNames(outs());
265264
outs() << "===========================\n";
266265
}
266+
267+
/// Hand the collected device profile to compiler-rt so that it is written to
/// a profraw file.
///
/// Counters, data records and names are packed back-to-back (in that order)
/// into one temporary buffer, and the bounds of each region are passed to
/// `__llvm_write_custom_profile` together with the target triple string.
///
/// \returns an error if `__llvm_write_custom_profile` is not linked in or
/// reports a non-zero status; success otherwise.
Error GPUProfGlobals::write() const {
  if (!__llvm_write_custom_profile)
    return Plugin::error("Could not find symbol __llvm_write_custom_profile. "
                         "The compiler-rt profiling library must be linked for "
                         "GPU PGO to work.");

  const size_t CountersBytes = Counts.size() * sizeof(int64_t);
  const size_t DataBytes = Data.size() * sizeof(__llvm_profile_data);
  const size_t NamesBytes = NamesData.size();

  // Single allocation laid out as [counters | data | names]. Each section must
  // be contiguous so that the PGO library can compute deltas properly.
  SmallVector<uint8_t> Packed(CountersBytes + DataBytes + NamesBytes);
  uint8_t *const CountersRegion = Packed.data();
  uint8_t *const DataRegion = CountersRegion + CountersBytes;
  uint8_t *const NamesRegion = DataRegion + DataBytes;
  uint8_t *const PackedEnd = NamesRegion + NamesBytes;

  // Fill each region from its source vector.
  memcpy(CountersRegion, Counts.data(), CountersBytes);
  memcpy(DataRegion, Data.data(), DataBytes);
  memcpy(NamesRegion, NamesData.data(), NamesBytes);

  // Invoke the compiler-rt entry point; each region ends where the next one
  // begins.
  const int Status = __llvm_write_custom_profile(
      TargetTriple.str().c_str(),
      reinterpret_cast<__llvm_profile_data *>(DataRegion),
      reinterpret_cast<__llvm_profile_data *>(NamesRegion),
      reinterpret_cast<char *>(CountersRegion),
      reinterpret_cast<char *>(DataRegion),
      reinterpret_cast<char *>(NamesRegion),
      reinterpret_cast<char *>(PackedEnd));

  if (Status != 0)
    return Plugin::error("Error writing GPU PGO data to file");

  return Plugin::success();
}

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -861,8 +861,14 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
861861
if (!ProfOrErr)
862862
return ProfOrErr.takeError();
863863

864-
// TODO: write data to profiling file
865-
ProfOrErr->dump();
864+
// Dump out profdata
865+
if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
866+
uint32_t(DeviceDebugKind::PGODump))
867+
ProfOrErr->dump();
868+
869+
// Write data to profiling file
870+
if (auto Err = ProfOrErr->write())
871+
return Err;
866872
}
867873

868874
// Delete the memory manager before deinitializing the device. Otherwise,

offload/test/lit.cfg

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,10 @@ config.available_features.add(config.libomptarget_current_target)
112112
if config.libomptarget_has_libc:
113113
config.available_features.add('libc')
114114

115+
profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata")
115116
if config.libomptarget_test_pgo:
116117
config.available_features.add('pgo')
118+
config.substitutions.append(("%profdata", profdata_path))
117119

118120
# Determine whether the test system supports unified memory.
119121
# For CUDA, this is the case with compute capability 70 (Volta) or higher.
@@ -407,6 +409,8 @@ if config.test_fortran_compiler:
407409
config.available_features.add('flang')
408410
config.substitutions.append(("%flang", config.test_fortran_compiler))
409411

412+
config.substitutions.append(("%target_triple", config.libomptarget_current_target))
413+
410414
config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
411415
if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path:
412416
config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))

offload/test/lit.site.cfg.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
@AUTO_GEN_COMMENT@
22

3-
config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
3+
config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@"
44
config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
55
config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
66
config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@"

offload/test/offloading/pgo1.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
1-
// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
2-
// RUN: -Xclang "-fprofile-instrument=clang"
3-
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
4-
// RUN: --check-prefix="CLANG-PGO"
51
// RUN: %libomptarget-compile-generic -fprofile-generate \
62
// RUN: -Xclang "-fprofile-instrument=llvm"
7-
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
3+
// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1
4+
// RUN: %profdata show --all-functions --counts \
5+
// RUN: %target_triple.llvm.profraw | %fcheck-generic \
86
// RUN: --check-prefix="LLVM-PGO"
97

8+
// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
9+
// RUN: -Xclang "-fprofile-instrument=clang"
10+
// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1
11+
// RUN: %profdata show --all-functions --counts \
12+
// RUN: %target_triple.clang.profraw | %fcheck-generic \
13+
// RUN: --check-prefix="CLANG-PGO"
14+
1015
// REQUIRES: gpu
1116
// REQUIRES: pgo
1217

openmp/docs/design/Runtimes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1522,3 +1522,4 @@ debugging features are supported.
15221522
* Enable debugging assertions in the device. ``0x01``
15231523
* Enable diagnosing common problems during offloading. ``0x4``
15241524
* Enable device malloc statistics (amdgpu only). ``0x8``
1525+
* Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``

0 commit comments

Comments
 (0)