Skip to content

[PGO][OpenMP] Instrumentation for GPU devices (Revision of #76587) #102691

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions clang/lib/CodeGen/CodeGenPGO.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1195,10 +1195,15 @@ void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S,

unsigned Counter = (*RegionCounterMap)[S];

llvm::Value *Args[] = {FuncNameVar,
Builder.getInt64(FunctionHash),
Builder.getInt32(NumRegionCounters),
Builder.getInt32(Counter), StepV};
// Make sure that pointer to global is passed in with zero addrspace
// This is relevant during GPU profiling
auto *NormalizedFuncNameVarPtr =
llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));

llvm::Value *Args[] = {
NormalizedFuncNameVarPtr, Builder.getInt64(FunctionHash),
Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV};

if (llvm::EnableSingleByteCoverage)
Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_cover),
Expand Down
3 changes: 3 additions & 0 deletions llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)

__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr)
__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64)

__OMP_RTL(__last, false, Void, )

#undef __OMP_RTL
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/ProfileData/InstrProf.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ inline StringRef getInstrProfBitmapBiasVarName() {
/// Return the marker used to separate PGO names during serialization.
inline StringRef getInstrProfNameSeparator() { return "\01"; }

/// Determines whether module targets a GPU eligable for PGO
/// instrumentation
bool isGPUProfTarget(const Module &M);

/// Please use getIRPGOFuncName for LLVM IR instrumentation. This function is
/// for front-end (Clang, etc) instrumentation.
/// Return the modified name for function \c F suitable to be
Expand Down
25 changes: 20 additions & 5 deletions llvm/lib/ProfileData/InstrProf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,13 +437,31 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
return VarName;
}

bool isGPUProfTarget(const Module &M) {
const auto &T = Triple(M.getTargetTriple());
return T.isAMDGPU() || T.isNVPTX();
}

void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) {
// If the target is a GPU, make the symbol protected so it can
// be read from the host device
if (isGPUProfTarget(M))
FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
// Hide the symbol so that we correctly get a copy for each executable.
else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
}

GlobalVariable *createPGOFuncNameVar(Module &M,
GlobalValue::LinkageTypes Linkage,
StringRef PGOFuncName) {
// Ensure profiling variables on GPU are visible to be read from host
if (isGPUProfTarget(M))
Linkage = GlobalValue::ExternalLinkage;
// We generally want to match the function's linkage, but available_externally
// and extern_weak both have the wrong semantics, and anything that doesn't
// need to link across compilation units doesn't need to be visible at all.
if (Linkage == GlobalValue::ExternalWeakLinkage)
else if (Linkage == GlobalValue::ExternalWeakLinkage)
Linkage = GlobalValue::LinkOnceAnyLinkage;
else if (Linkage == GlobalValue::AvailableExternallyLinkage)
Linkage = GlobalValue::LinkOnceODRLinkage;
Expand All @@ -457,10 +475,7 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
new GlobalVariable(M, Value->getType(), true, Linkage, Value,
getPGOFuncNameVarName(PGOFuncName, Linkage));

// Hide the symbol so that we correctly get a copy for each executable.
if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);

setPGOFuncVisibility(M, FuncNameVar);
return FuncNameVar;
}

Expand Down
44 changes: 34 additions & 10 deletions llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1059,6 +1059,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
llvm::InstrProfValueKind::IPVK_MemOPSize);
CallInst *Call = nullptr;
auto *TLI = &GetTLI(*Ind->getFunction());
auto *NormalizedDataVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
DataVar, PointerType::get(M.getContext(), 0));

// To support value profiling calls within Windows exception handlers, funclet
// information contained within operand bundles needs to be copied over to
Expand All @@ -1067,11 +1069,13 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
SmallVector<OperandBundleDef, 1> OpBundles;
Ind->getOperandBundlesAsDefs(OpBundles);
if (!IsMemOpSize) {
Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)};
Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr,
Builder.getInt32(Index)};
Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args,
OpBundles);
} else {
Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)};
Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr,
Builder.getInt32(Index)};
Call = Builder.CreateCall(
getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp),
Args, OpBundles);
Expand Down Expand Up @@ -1814,7 +1818,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
ValuesVar->setAlignment(Align(8));
maybeSetComdat(ValuesVar, Fn, CntsVarName);
ValuesPtrExpr = ValuesVar;
ValuesPtrExpr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
ValuesVar, PointerType::get(Fn->getContext(), 0));
}

uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
Expand All @@ -1838,6 +1843,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);

if (isGPUProfTarget(M)) {
Linkage = GlobalValue::ExternalLinkage;
Visibility = GlobalValue::ProtectedVisibility;
}
// If the data variable is not referenced by code (if we don't emit
// @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
// data variable live under linker GC, the data variable can be private. This
Expand All @@ -1849,9 +1858,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
// that other copies must have the same CFG and cannot have value profiling.
// If no hash suffix, other profd copies may be referenced by code.
if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
(TT.isOSBinFormatELF() ||
(!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
(TT.isOSBinFormatELF() ||
(!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
Linkage = GlobalValue::PrivateLinkage;
Visibility = GlobalValue::DefaultVisibility;
}
Expand Down Expand Up @@ -1974,6 +1983,13 @@ void InstrLowerer::emitNameData() {
NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
GlobalValue::PrivateLinkage, NamesVal,
getInstrProfNamesVarName());

// Make names variable public if current target is a GPU
if (isGPUProfTarget(M)) {
NamesVar->setLinkage(GlobalValue::ExternalLinkage);
NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility);
}

NamesSize = CompressedNameStr.size();
setGlobalVariableLargeSection(TT, *NamesVar);
NamesVar->setSection(
Expand Down Expand Up @@ -2040,10 +2056,13 @@ void InstrLowerer::emitRegistration() {
IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF));
for (Value *Data : CompilerUsedVars)
if (!isa<Function>(Data))
IRB.CreateCall(RuntimeRegisterF, Data);
// Check for addrspace cast when profiling GPU
IRB.CreateCall(RuntimeRegisterF,
IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
for (Value *Data : UsedVars)
if (Data != NamesVar && !isa<Function>(Data))
IRB.CreateCall(RuntimeRegisterF, Data);
IRB.CreateCall(RuntimeRegisterF,
IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));

if (NamesVar) {
Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
Expand All @@ -2052,7 +2071,9 @@ void InstrLowerer::emitRegistration() {
auto *NamesRegisterF =
Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
getInstrProfNamesRegFuncName(), M);
IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)});
IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast(
NamesVar, VoidPtrTy),
IRB.getInt64(NamesSize)});
}

IRB.CreateRetVoid();
Expand All @@ -2073,7 +2094,10 @@ bool InstrLowerer::emitRuntimeHook() {
auto *Var =
new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
nullptr, getInstrProfRuntimeHookVarName());
Var->setVisibility(GlobalValue::HiddenVisibility);
if (isGPUProfTarget(M))
Var->setVisibility(GlobalValue::ProtectedVisibility);
else
Var->setVisibility(GlobalValue::HiddenVisibility);

if (TT.isOSBinFormatELF() && !TT.isPS()) {
// Mark the user variable as used so that it isn't stripped out.
Expand Down
24 changes: 18 additions & 6 deletions llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -909,14 +909,18 @@ void FunctionInstrumenter::instrument() {
auto Name = FuncInfo.FuncNameVar;
auto CFGHash =
ConstantInt::get(Type::getInt64Ty(M.getContext()), FuncInfo.FunctionHash);
// Make sure that pointer to global is passed in with zero addrspace
// This is relevant during GPU profiling
auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
Name, PointerType::get(M.getContext(), 0));
if (PGOFunctionEntryCoverage) {
auto &EntryBB = F.getEntryBlock();
IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
// llvm.instrprof.cover(i8* <name>, i64 <hash>, i32 <num-counters>,
// i32 <index>)
Builder.CreateCall(
Intrinsic::getDeclaration(&M, Intrinsic::instrprof_cover),
{Name, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
{NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
return;
}

Expand Down Expand Up @@ -971,7 +975,8 @@ void FunctionInstrumenter::instrument() {
// i32 <index>)
Builder.CreateCall(
Intrinsic::getDeclaration(&M, Intrinsic::instrprof_timestamp),
{Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)});
{NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
Builder.getInt32(I)});
I += PGOBlockCoverage ? 8 : 1;
}

Expand All @@ -985,7 +990,8 @@ void FunctionInstrumenter::instrument() {
Intrinsic::getDeclaration(&M, PGOBlockCoverage
? Intrinsic::instrprof_cover
: Intrinsic::instrprof_increment),
{Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I++)});
{NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
Builder.getInt32(I++)});
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to change the name used to profile the select instructions below as well?

Specifically, for this line https://github.com/llvm/llvm-project/pull/102691/files#diff-618e0bf5dc7c44fae29843f3be69506a1b0a1409c7f018c61b0acf593f1c4605R998, should we use NormalizedNamePtr instead of FuncInfo.FuncNameVar? Maybe it does not matter but I'd like to understand if this is an explicit choice.

Thanks so much!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for reaching out. instrumentSelects accepts a GlobalVariable as an argument, not a Value. The global variable addrspace cast handling is done inside SelectInstVisitor::instrumentOneSelectInst:

auto *NormalizedFuncNameVarPtr =
ConstantExpr::getPointerBitCastOrAddrSpaceCast(
FuncNameVar, PointerType::get(M->getContext(), 0));
Builder.CreateCall(
Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
{NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});

I haven't encountered any errors raised by this snippet, but please let me know if you've been having any issues with it on your end.

}

// Now instrument select instructions:
Expand Down Expand Up @@ -1028,11 +1034,14 @@ void FunctionInstrumenter::instrument() {
ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
assert(ToProfile && "value profiling Value is of unexpected type");

auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
Name, PointerType::get(M.getContext(), 0));

SmallVector<OperandBundleDef, 1> OpBundles;
populateEHOperandBundle(Cand, BlockColors, OpBundles);
Builder.CreateCall(
Intrinsic::getDeclaration(&M, Intrinsic::instrprof_value_profile),
{FuncInfo.FuncNameVar, Builder.getInt64(FuncInfo.FunctionHash),
{NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash),
ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
OpBundles);
}
Expand Down Expand Up @@ -1709,10 +1718,13 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
IRBuilder<> Builder(&SI);
Type *Int64Ty = Builder.getInt64Ty();
auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
auto *NormalizedFuncNameVarPtr =
ConstantExpr::getPointerBitCastOrAddrSpaceCast(
FuncNameVar, PointerType::get(M->getContext(), 0));
Builder.CreateCall(
Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
{FuncNameVar, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
Builder.getInt32(*CurCtrIdx), Step});
{NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
++(*CurCtrIdx);
}

Expand Down
2 changes: 2 additions & 0 deletions offload/DeviceRTL/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ set(include_files
${include_directory}/Interface.h
${include_directory}/LibC.h
${include_directory}/Mapping.h
${include_directory}/Profiling.h
${include_directory}/State.h
${include_directory}/Synchronization.h
${include_directory}/Types.h
Expand All @@ -93,6 +94,7 @@ set(src_files
${source_directory}/Mapping.cpp
${source_directory}/Misc.cpp
${source_directory}/Parallelism.cpp
${source_directory}/Profiling.cpp
${source_directory}/Reduction.cpp
${source_directory}/State.cpp
${source_directory}/Synchronization.cpp
Expand Down
21 changes: 21 additions & 0 deletions offload/DeviceRTL/include/Profiling.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_DEVICERTL_PROFILING_H
#define OMPTARGET_DEVICERTL_PROFILING_H

extern "C" {
void __llvm_profile_register_function(void *Ptr);
void __llvm_profile_register_names_function(void *Ptr, long int I);
void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
}

#endif
22 changes: 22 additions & 0 deletions offload/DeviceRTL/src/Profiling.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===------- Profiling.cpp ---------------------------------------- C++ ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "Profiling.h"

#pragma omp begin declare target device_type(nohost)

extern "C" {

// Provides empty implementations for certain functions in compiler-rt
// that are emitted by the PGO instrumentation.
void __llvm_profile_register_function(void *Ptr) {}
void __llvm_profile_register_names_function(void *Ptr, long int I) {}
void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
}

#pragma omp end declare target
3 changes: 2 additions & 1 deletion offload/plugins-nextgen/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ add_library(PluginCommon OBJECT
src/RPC.cpp
src/Utils/ELF.cpp
)
add_dependencies(PluginCommon intrinsics_gen)
add_dependencies(PluginCommon intrinsics_gen LLVMProfileData)

# Only enable JIT for those targets that LLVM can support.
set(supported_jit_targets AMDGPU NVPTX)
Expand Down Expand Up @@ -52,6 +52,7 @@ target_compile_definitions(PluginCommon PRIVATE

target_compile_options(PluginCommon PUBLIC ${offload_compile_flags})
target_link_options(PluginCommon PUBLIC ${offload_link_flags})
target_link_libraries(PluginCommon PRIVATE LLVMProfileData)

target_include_directories(PluginCommon PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/include
Expand Down
Loading
Loading