Skip to content

Commit fde2d23

Browse files
[PGO][OpenMP] Instrumentation for GPU devices (Revision of #76587) (#102691)
This pull request is a revised version of #76587. This pull request fixes some build issues that were present in the previous version of this change. > This pull request is the first part of an ongoing effort to extend PGO instrumentation to GPU device code. This PR makes the following changes: > > - Adds blank registration functions to device RTL > - Gives PGO globals protected visibility when targeting a supported GPU > - Handles any addrspace casts for PGO calls > - Implements PGO global extraction in GPU plugins (currently only dumps info) > > These changes can be tested by supplying `-fprofile-instrument=clang` while targeting a GPU.
1 parent ded6dd2 commit fde2d23

File tree

17 files changed

+357
-28
lines changed

17 files changed

+357
-28
lines changed

clang/lib/CodeGen/CodeGenPGO.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,10 +1195,15 @@ void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S,
11951195

11961196
unsigned Counter = (*RegionCounterMap)[S];
11971197

1198-
llvm::Value *Args[] = {FuncNameVar,
1199-
Builder.getInt64(FunctionHash),
1200-
Builder.getInt32(NumRegionCounters),
1201-
Builder.getInt32(Counter), StepV};
1198+
// Make sure that pointer to global is passed in with zero addrspace
1199+
// This is relevant during GPU profiling
1200+
auto *NormalizedFuncNameVarPtr =
1201+
llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1202+
FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));
1203+
1204+
llvm::Value *Args[] = {
1205+
NormalizedFuncNameVarPtr, Builder.getInt64(FunctionHash),
1206+
Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV};
12021207

12031208
if (llvm::EnableSingleByteCoverage)
12041209
Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_cover),

llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
506506
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
507507
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
508508

509+
__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr)
510+
__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64)
511+
509512
__OMP_RTL(__last, false, Void, )
510513

511514
#undef __OMP_RTL

llvm/include/llvm/ProfileData/InstrProf.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ inline StringRef getInstrProfBitmapBiasVarName() {
181181
/// Return the marker used to separate PGO names during serialization.
182182
inline StringRef getInstrProfNameSeparator() { return "\01"; }
183183

184+
/// Determines whether module targets a GPU eligible for PGO
185+
/// instrumentation
186+
bool isGPUProfTarget(const Module &M);
187+
184188
/// Please use getIRPGOFuncName for LLVM IR instrumentation. This function is
185189
/// for front-end (Clang, etc) instrumentation.
186190
/// Return the modified name for function \c F suitable to be

llvm/lib/ProfileData/InstrProf.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -437,13 +437,31 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
437437
return VarName;
438438
}
439439

440+
bool isGPUProfTarget(const Module &M) {
441+
const auto &T = Triple(M.getTargetTriple());
442+
return T.isAMDGPU() || T.isNVPTX();
443+
}
444+
445+
void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) {
446+
// If the target is a GPU, make the symbol protected so it can
447+
// be read from the host device
448+
if (isGPUProfTarget(M))
449+
FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
450+
// Hide the symbol so that we correctly get a copy for each executable.
451+
else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
452+
FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
453+
}
454+
440455
GlobalVariable *createPGOFuncNameVar(Module &M,
441456
GlobalValue::LinkageTypes Linkage,
442457
StringRef PGOFuncName) {
458+
// Ensure profiling variables on GPU are visible to be read from host
459+
if (isGPUProfTarget(M))
460+
Linkage = GlobalValue::ExternalLinkage;
443461
// We generally want to match the function's linkage, but available_externally
444462
// and extern_weak both have the wrong semantics, and anything that doesn't
445463
// need to link across compilation units doesn't need to be visible at all.
446-
if (Linkage == GlobalValue::ExternalWeakLinkage)
464+
else if (Linkage == GlobalValue::ExternalWeakLinkage)
447465
Linkage = GlobalValue::LinkOnceAnyLinkage;
448466
else if (Linkage == GlobalValue::AvailableExternallyLinkage)
449467
Linkage = GlobalValue::LinkOnceODRLinkage;
@@ -457,10 +475,7 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
457475
new GlobalVariable(M, Value->getType(), true, Linkage, Value,
458476
getPGOFuncNameVarName(PGOFuncName, Linkage));
459477

460-
// Hide the symbol so that we correctly get a copy for each executable.
461-
if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
462-
FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
463-
478+
setPGOFuncVisibility(M, FuncNameVar);
464479
return FuncNameVar;
465480
}
466481

llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,6 +1059,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
10591059
llvm::InstrProfValueKind::IPVK_MemOPSize);
10601060
CallInst *Call = nullptr;
10611061
auto *TLI = &GetTLI(*Ind->getFunction());
1062+
auto *NormalizedDataVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1063+
DataVar, PointerType::get(M.getContext(), 0));
10621064

10631065
// To support value profiling calls within Windows exception handlers, funclet
10641066
// information contained within operand bundles needs to be copied over to
@@ -1067,11 +1069,13 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
10671069
SmallVector<OperandBundleDef, 1> OpBundles;
10681070
Ind->getOperandBundlesAsDefs(OpBundles);
10691071
if (!IsMemOpSize) {
1070-
Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)};
1072+
Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr,
1073+
Builder.getInt32(Index)};
10711074
Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args,
10721075
OpBundles);
10731076
} else {
1074-
Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)};
1077+
Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr,
1078+
Builder.getInt32(Index)};
10751079
Call = Builder.CreateCall(
10761080
getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp),
10771081
Args, OpBundles);
@@ -1814,7 +1818,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
18141818
getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
18151819
ValuesVar->setAlignment(Align(8));
18161820
maybeSetComdat(ValuesVar, Fn, CntsVarName);
1817-
ValuesPtrExpr = ValuesVar;
1821+
ValuesPtrExpr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1822+
ValuesVar, PointerType::get(Fn->getContext(), 0));
18181823
}
18191824

18201825
uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
@@ -1838,6 +1843,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
18381843
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
18391844
Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
18401845

1846+
if (isGPUProfTarget(M)) {
1847+
Linkage = GlobalValue::ExternalLinkage;
1848+
Visibility = GlobalValue::ProtectedVisibility;
1849+
}
18411850
// If the data variable is not referenced by code (if we don't emit
18421851
// @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
18431852
// data variable live under linker GC, the data variable can be private. This
@@ -1849,9 +1858,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
18491858
// If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
18501859
// that other copies must have the same CFG and cannot have value profiling.
18511860
// If no hash suffix, other profd copies may be referenced by code.
1852-
if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
1853-
(TT.isOSBinFormatELF() ||
1854-
(!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
1861+
else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
1862+
(TT.isOSBinFormatELF() ||
1863+
(!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
18551864
Linkage = GlobalValue::PrivateLinkage;
18561865
Visibility = GlobalValue::DefaultVisibility;
18571866
}
@@ -1974,6 +1983,13 @@ void InstrLowerer::emitNameData() {
19741983
NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
19751984
GlobalValue::PrivateLinkage, NamesVal,
19761985
getInstrProfNamesVarName());
1986+
1987+
// Make names variable public if current target is a GPU
1988+
if (isGPUProfTarget(M)) {
1989+
NamesVar->setLinkage(GlobalValue::ExternalLinkage);
1990+
NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility);
1991+
}
1992+
19771993
NamesSize = CompressedNameStr.size();
19781994
setGlobalVariableLargeSection(TT, *NamesVar);
19791995
NamesVar->setSection(
@@ -2040,10 +2056,13 @@ void InstrLowerer::emitRegistration() {
20402056
IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF));
20412057
for (Value *Data : CompilerUsedVars)
20422058
if (!isa<Function>(Data))
2043-
IRB.CreateCall(RuntimeRegisterF, Data);
2059+
// Check for addrspace cast when profiling GPU
2060+
IRB.CreateCall(RuntimeRegisterF,
2061+
IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
20442062
for (Value *Data : UsedVars)
20452063
if (Data != NamesVar && !isa<Function>(Data))
2046-
IRB.CreateCall(RuntimeRegisterF, Data);
2064+
IRB.CreateCall(RuntimeRegisterF,
2065+
IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
20472066

20482067
if (NamesVar) {
20492068
Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
@@ -2052,7 +2071,9 @@ void InstrLowerer::emitRegistration() {
20522071
auto *NamesRegisterF =
20532072
Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
20542073
getInstrProfNamesRegFuncName(), M);
2055-
IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)});
2074+
IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast(
2075+
NamesVar, VoidPtrTy),
2076+
IRB.getInt64(NamesSize)});
20562077
}
20572078

20582079
IRB.CreateRetVoid();
@@ -2073,7 +2094,10 @@ bool InstrLowerer::emitRuntimeHook() {
20732094
auto *Var =
20742095
new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
20752096
nullptr, getInstrProfRuntimeHookVarName());
2076-
Var->setVisibility(GlobalValue::HiddenVisibility);
2097+
if (isGPUProfTarget(M))
2098+
Var->setVisibility(GlobalValue::ProtectedVisibility);
2099+
else
2100+
Var->setVisibility(GlobalValue::HiddenVisibility);
20772101

20782102
if (TT.isOSBinFormatELF() && !TT.isPS()) {
20792103
// Mark the user variable as used so that it isn't stripped out.

llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -909,14 +909,18 @@ void FunctionInstrumenter::instrument() {
909909
auto Name = FuncInfo.FuncNameVar;
910910
auto CFGHash =
911911
ConstantInt::get(Type::getInt64Ty(M.getContext()), FuncInfo.FunctionHash);
912+
// Make sure that pointer to global is passed in with zero addrspace
913+
// This is relevant during GPU profiling
914+
auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
915+
Name, PointerType::get(M.getContext(), 0));
912916
if (PGOFunctionEntryCoverage) {
913917
auto &EntryBB = F.getEntryBlock();
914918
IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
915919
// llvm.instrprof.cover(i8* <name>, i64 <hash>, i32 <num-counters>,
916920
// i32 <index>)
917921
Builder.CreateCall(
918922
Intrinsic::getDeclaration(&M, Intrinsic::instrprof_cover),
919-
{Name, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
923+
{NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
920924
return;
921925
}
922926

@@ -971,7 +975,8 @@ void FunctionInstrumenter::instrument() {
971975
// i32 <index>)
972976
Builder.CreateCall(
973977
Intrinsic::getDeclaration(&M, Intrinsic::instrprof_timestamp),
974-
{Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)});
978+
{NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
979+
Builder.getInt32(I)});
975980
I += PGOBlockCoverage ? 8 : 1;
976981
}
977982

@@ -985,7 +990,8 @@ void FunctionInstrumenter::instrument() {
985990
Intrinsic::getDeclaration(&M, PGOBlockCoverage
986991
? Intrinsic::instrprof_cover
987992
: Intrinsic::instrprof_increment),
988-
{Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I++)});
993+
{NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
994+
Builder.getInt32(I++)});
989995
}
990996

991997
// Now instrument select instructions:
@@ -1028,11 +1034,14 @@ void FunctionInstrumenter::instrument() {
10281034
ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
10291035
assert(ToProfile && "value profiling Value is of unexpected type");
10301036

1037+
auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1038+
Name, PointerType::get(M.getContext(), 0));
1039+
10311040
SmallVector<OperandBundleDef, 1> OpBundles;
10321041
populateEHOperandBundle(Cand, BlockColors, OpBundles);
10331042
Builder.CreateCall(
10341043
Intrinsic::getDeclaration(&M, Intrinsic::instrprof_value_profile),
1035-
{FuncInfo.FuncNameVar, Builder.getInt64(FuncInfo.FunctionHash),
1044+
{NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash),
10361045
ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
10371046
OpBundles);
10381047
}
@@ -1709,10 +1718,13 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
17091718
IRBuilder<> Builder(&SI);
17101719
Type *Int64Ty = Builder.getInt64Ty();
17111720
auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
1721+
auto *NormalizedFuncNameVarPtr =
1722+
ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1723+
FuncNameVar, PointerType::get(M->getContext(), 0));
17121724
Builder.CreateCall(
17131725
Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
1714-
{FuncNameVar, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
1715-
Builder.getInt32(*CurCtrIdx), Step});
1726+
{NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
1727+
Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
17161728
++(*CurCtrIdx);
17171729
}
17181730

offload/DeviceRTL/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ set(include_files
7777
${include_directory}/Interface.h
7878
${include_directory}/LibC.h
7979
${include_directory}/Mapping.h
80+
${include_directory}/Profiling.h
8081
${include_directory}/State.h
8182
${include_directory}/Synchronization.h
8283
${include_directory}/Types.h
@@ -93,6 +94,7 @@ set(src_files
9394
${source_directory}/Mapping.cpp
9495
${source_directory}/Misc.cpp
9596
${source_directory}/Parallelism.cpp
97+
${source_directory}/Profiling.cpp
9698
${source_directory}/Reduction.cpp
9799
${source_directory}/State.cpp
98100
${source_directory}/Synchronization.cpp

offload/DeviceRTL/include/Profiling.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
#ifndef OMPTARGET_DEVICERTL_PROFILING_H
13+
#define OMPTARGET_DEVICERTL_PROFILING_H
14+
15+
extern "C" {
16+
void __llvm_profile_register_function(void *Ptr);
17+
void __llvm_profile_register_names_function(void *Ptr, long int I);
18+
void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
19+
}
20+
21+
#endif

offload/DeviceRTL/src/Profiling.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
//===------- Profiling.cpp ---------------------------------------- C++ ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "Profiling.h"
10+
11+
#pragma omp begin declare target device_type(nohost)
12+
13+
extern "C" {
14+
15+
// Provides empty implementations for certain functions in compiler-rt
16+
// that are emitted by the PGO instrumentation.
17+
void __llvm_profile_register_function(void *Ptr) {}
18+
void __llvm_profile_register_names_function(void *Ptr, long int I) {}
19+
void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
20+
}
21+
22+
#pragma omp end declare target

offload/plugins-nextgen/common/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ add_library(PluginCommon OBJECT
77
src/RPC.cpp
88
src/Utils/ELF.cpp
99
)
10-
add_dependencies(PluginCommon intrinsics_gen)
10+
add_dependencies(PluginCommon intrinsics_gen LLVMProfileData)
1111

1212
# Only enable JIT for those targets that LLVM can support.
1313
set(supported_jit_targets AMDGPU NVPTX)
@@ -52,6 +52,7 @@ target_compile_definitions(PluginCommon PRIVATE
5252

5353
target_compile_options(PluginCommon PUBLIC ${offload_compile_flags})
5454
target_link_options(PluginCommon PUBLIC ${offload_link_flags})
55+
target_link_libraries(PluginCommon PRIVATE LLVMProfileData)
5556

5657
target_include_directories(PluginCommon PUBLIC
5758
${CMAKE_CURRENT_SOURCE_DIR}/include

0 commit comments

Comments
 (0)