Skip to content

Commit 5fd2af3

Browse files
[PGO][OpenMP] Instrumentation for GPU devices (#76587)
This pull request is the first part of an ongoing effort to extend PGO instrumentation to GPU device code. This PR makes the following changes: - Adds blank registration functions to device RTL - Gives PGO globals protected visibility when targeting a supported GPU - Handles any addrspace casts for PGO calls - Implements PGO global extraction in GPU plugins (currently only dumps info) These changes can be tested by supplying `-fprofile-instrument=clang` while targeting a GPU.
1 parent 3497500 commit 5fd2af3

File tree

16 files changed

+358
-27
lines changed

16 files changed

+358
-27
lines changed

clang/lib/CodeGen/CodeGenPGO.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1193,10 +1193,15 @@ void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S,
11931193

11941194
unsigned Counter = (*RegionCounterMap)[S];
11951195

1196-
llvm::Value *Args[] = {FuncNameVar,
1197-
Builder.getInt64(FunctionHash),
1198-
Builder.getInt32(NumRegionCounters),
1199-
Builder.getInt32(Counter), StepV};
1196+
// Make sure that pointer to global is passed in with zero addrspace
1197+
// This is relevant during GPU profiling
1198+
auto *NormalizedFuncNameVarPtr =
1199+
llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1200+
FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0));
1201+
1202+
llvm::Value *Args[] = {
1203+
NormalizedFuncNameVarPtr, Builder.getInt64(FunctionHash),
1204+
Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV};
12001205

12011206
if (llvm::EnableSingleByteCoverage)
12021207
Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_cover),

llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
502502
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
503503
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
504504

505+
__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr)
506+
__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64)
507+
505508
__OMP_RTL(__last, false, Void, )
506509

507510
#undef __OMP_RTL

llvm/include/llvm/ProfileData/InstrProf.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,10 @@ inline StringRef getInstrProfCounterBiasVarName() {
177177
/// Return the marker used to separate PGO names during serialization.
178178
inline StringRef getInstrProfNameSeparator() { return "\01"; }
179179

180+
/// Determines whether module targets a GPU eligible for PGO
181+
/// instrumentation
182+
bool isGPUProfTarget(const Module &M);
183+
180184
/// Please use getIRPGOFuncName for LLVM IR instrumentation. This function is
181185
/// for front-end (Clang, etc) instrumentation.
182186
/// Return the modified name for function \c F suitable to be

llvm/lib/ProfileData/InstrProf.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -432,13 +432,31 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
432432
return VarName;
433433
}
434434

435+
bool isGPUProfTarget(const Module &M) {
436+
const auto &T = Triple(M.getTargetTriple());
437+
return T.isAMDGPU() || T.isNVPTX();
438+
}
439+
440+
void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) {
441+
// If the target is a GPU, make the symbol protected so it can
442+
// be read from the host device
443+
if (isGPUProfTarget(M))
444+
FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
445+
// Hide the symbol so that we correctly get a copy for each executable.
446+
else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
447+
FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
448+
}
449+
435450
GlobalVariable *createPGOFuncNameVar(Module &M,
436451
GlobalValue::LinkageTypes Linkage,
437452
StringRef PGOFuncName) {
453+
// Ensure profiling variables on GPU are visible to be read from host
454+
if (isGPUProfTarget(M))
455+
Linkage = GlobalValue::ExternalLinkage;
438456
// We generally want to match the function's linkage, but available_externally
439457
// and extern_weak both have the wrong semantics, and anything that doesn't
440458
// need to link across compilation units doesn't need to be visible at all.
441-
if (Linkage == GlobalValue::ExternalWeakLinkage)
459+
else if (Linkage == GlobalValue::ExternalWeakLinkage)
442460
Linkage = GlobalValue::LinkOnceAnyLinkage;
443461
else if (Linkage == GlobalValue::AvailableExternallyLinkage)
444462
Linkage = GlobalValue::LinkOnceODRLinkage;
@@ -452,10 +470,7 @@ GlobalVariable *createPGOFuncNameVar(Module &M,
452470
new GlobalVariable(M, Value->getType(), true, Linkage, Value,
453471
getPGOFuncNameVarName(PGOFuncName, Linkage));
454472

455-
// Hide the symbol so that we correctly get a copy for each executable.
456-
if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
457-
FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
458-
473+
setPGOFuncVisibility(M, FuncNameVar);
459474
return FuncNameVar;
460475
}
461476

llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
879879
llvm::InstrProfValueKind::IPVK_MemOPSize);
880880
CallInst *Call = nullptr;
881881
auto *TLI = &GetTLI(*Ind->getFunction());
882+
auto *NormalizedDataVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
883+
DataVar, PointerType::get(M.getContext(), 0));
882884

883885
// To support value profiling calls within Windows exception handlers, funclet
884886
// information contained within operand bundles needs to be copied over to
@@ -887,11 +889,13 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
887889
SmallVector<OperandBundleDef, 1> OpBundles;
888890
Ind->getOperandBundlesAsDefs(OpBundles);
889891
if (!IsMemOpSize) {
890-
Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)};
892+
Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr,
893+
Builder.getInt32(Index)};
891894
Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args,
892895
OpBundles);
893896
} else {
894-
Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)};
897+
Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr,
898+
Builder.getInt32(Index)};
895899
Call = Builder.CreateCall(
896900
getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp),
897901
Args, OpBundles);
@@ -1616,7 +1620,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
16161620
getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
16171621
ValuesVar->setAlignment(Align(8));
16181622
maybeSetComdat(ValuesVar, Fn, CntsVarName);
1619-
ValuesPtrExpr = ValuesVar;
1623+
ValuesPtrExpr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1624+
ValuesVar, PointerType::get(Fn->getContext(), 0));
16201625
}
16211626

16221627
uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
@@ -1640,6 +1645,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
16401645
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
16411646
Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
16421647

1648+
if (isGPUProfTarget(M)) {
1649+
Linkage = GlobalValue::ExternalLinkage;
1650+
Visibility = GlobalValue::ProtectedVisibility;
1651+
}
16431652
// If the data variable is not referenced by code (if we don't emit
16441653
// @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
16451654
// data variable live under linker GC, the data variable can be private. This
@@ -1651,9 +1660,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
16511660
// If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
16521661
// that other copies must have the same CFG and cannot have value profiling.
16531662
// If no hash suffix, other profd copies may be referenced by code.
1654-
if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
1655-
(TT.isOSBinFormatELF() ||
1656-
(!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
1663+
else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
1664+
(TT.isOSBinFormatELF() ||
1665+
(!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
16571666
Linkage = GlobalValue::PrivateLinkage;
16581667
Visibility = GlobalValue::DefaultVisibility;
16591668
}
@@ -1776,6 +1785,13 @@ void InstrLowerer::emitNameData() {
17761785
NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
17771786
GlobalValue::PrivateLinkage, NamesVal,
17781787
getInstrProfNamesVarName());
1788+
1789+
// Make names variable public if current target is a GPU
1790+
if (isGPUProfTarget(M)) {
1791+
NamesVar->setLinkage(GlobalValue::ExternalLinkage);
1792+
NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility);
1793+
}
1794+
17791795
NamesSize = CompressedNameStr.size();
17801796
setGlobalVariableLargeSection(TT, *NamesVar);
17811797
NamesVar->setSection(
@@ -1842,10 +1858,13 @@ void InstrLowerer::emitRegistration() {
18421858
IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF));
18431859
for (Value *Data : CompilerUsedVars)
18441860
if (!isa<Function>(Data))
1845-
IRB.CreateCall(RuntimeRegisterF, Data);
1861+
// Check for addrspace cast when profiling GPU
1862+
IRB.CreateCall(RuntimeRegisterF,
1863+
IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
18461864
for (Value *Data : UsedVars)
18471865
if (Data != NamesVar && !isa<Function>(Data))
1848-
IRB.CreateCall(RuntimeRegisterF, Data);
1866+
IRB.CreateCall(RuntimeRegisterF,
1867+
IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy));
18491868

18501869
if (NamesVar) {
18511870
Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
@@ -1854,7 +1873,9 @@ void InstrLowerer::emitRegistration() {
18541873
auto *NamesRegisterF =
18551874
Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
18561875
getInstrProfNamesRegFuncName(), M);
1857-
IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)});
1876+
IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast(
1877+
NamesVar, VoidPtrTy),
1878+
IRB.getInt64(NamesSize)});
18581879
}
18591880

18601881
IRB.CreateRetVoid();
@@ -1875,7 +1896,10 @@ bool InstrLowerer::emitRuntimeHook() {
18751896
auto *Var =
18761897
new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
18771898
nullptr, getInstrProfRuntimeHookVarName());
1878-
Var->setVisibility(GlobalValue::HiddenVisibility);
1899+
if (isGPUProfTarget(M))
1900+
Var->setVisibility(GlobalValue::ProtectedVisibility);
1901+
else
1902+
Var->setVisibility(GlobalValue::HiddenVisibility);
18791903

18801904
if (TT.isOSBinFormatELF() && !TT.isPS()) {
18811905
// Mark the user variable as used so that it isn't stripped out.

llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -883,14 +883,18 @@ static void instrumentOneFunc(
883883
auto Name = FuncInfo.FuncNameVar;
884884
auto CFGHash = ConstantInt::get(Type::getInt64Ty(M->getContext()),
885885
FuncInfo.FunctionHash);
886+
// Make sure that pointer to global is passed in with zero addrspace
887+
// This is relevant during GPU profiling
888+
auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
889+
Name, PointerType::get(M->getContext(), 0));
886890
if (PGOFunctionEntryCoverage) {
887891
auto &EntryBB = F.getEntryBlock();
888892
IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
889893
// llvm.instrprof.cover(i8* <name>, i64 <hash>, i32 <num-counters>,
890894
// i32 <index>)
891895
Builder.CreateCall(
892896
Intrinsic::getDeclaration(M, Intrinsic::instrprof_cover),
893-
{Name, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
897+
{NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)});
894898
return;
895899
}
896900

@@ -945,7 +949,8 @@ static void instrumentOneFunc(
945949
// i32 <index>)
946950
Builder.CreateCall(
947951
Intrinsic::getDeclaration(M, Intrinsic::instrprof_timestamp),
948-
{Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)});
952+
{NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
953+
Builder.getInt32(I)});
949954
I += PGOBlockCoverage ? 8 : 1;
950955
}
951956

@@ -959,7 +964,8 @@ static void instrumentOneFunc(
959964
Intrinsic::getDeclaration(M, PGOBlockCoverage
960965
? Intrinsic::instrprof_cover
961966
: Intrinsic::instrprof_increment),
962-
{Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I++)});
967+
{NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters),
968+
Builder.getInt32(I++)});
963969
}
964970

965971
// Now instrument select instructions:
@@ -1002,11 +1008,14 @@ static void instrumentOneFunc(
10021008
ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
10031009
assert(ToProfile && "value profiling Value is of unexpected type");
10041010

1011+
auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1012+
Name, PointerType::get(M->getContext(), 0));
1013+
10051014
SmallVector<OperandBundleDef, 1> OpBundles;
10061015
populateEHOperandBundle(Cand, BlockColors, OpBundles);
10071016
Builder.CreateCall(
10081017
Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
1009-
{FuncInfo.FuncNameVar, Builder.getInt64(FuncInfo.FunctionHash),
1018+
{NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash),
10101019
ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
10111020
OpBundles);
10121021
}
@@ -1681,10 +1690,13 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
16811690
IRBuilder<> Builder(&SI);
16821691
Type *Int64Ty = Builder.getInt64Ty();
16831692
auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
1693+
auto *NormalizedFuncNameVarPtr =
1694+
ConstantExpr::getPointerBitCastOrAddrSpaceCast(
1695+
FuncNameVar, PointerType::get(M->getContext(), 0));
16841696
Builder.CreateCall(
16851697
Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
1686-
{FuncNameVar, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
1687-
Builder.getInt32(*CurCtrIdx), Step});
1698+
{NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash),
1699+
Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
16881700
++(*CurCtrIdx);
16891701
}
16901702

offload/DeviceRTL/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ set(include_files
7777
${include_directory}/Interface.h
7878
${include_directory}/LibC.h
7979
${include_directory}/Mapping.h
80+
${include_directory}/Profiling.h
8081
${include_directory}/State.h
8182
${include_directory}/Synchronization.h
8283
${include_directory}/Types.h
@@ -92,6 +93,7 @@ set(src_files
9293
${source_directory}/Mapping.cpp
9394
${source_directory}/Misc.cpp
9495
${source_directory}/Parallelism.cpp
96+
${source_directory}/Profiling.cpp
9597
${source_directory}/Reduction.cpp
9698
${source_directory}/State.cpp
9799
${source_directory}/Synchronization.cpp

offload/DeviceRTL/include/Profiling.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
#ifndef OMPTARGET_DEVICERTL_PROFILING_H
13+
#define OMPTARGET_DEVICERTL_PROFILING_H
14+
15+
extern "C" {
16+
void __llvm_profile_register_function(void *Ptr);
17+
void __llvm_profile_register_names_function(void *Ptr, long int I);
18+
void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
19+
}
20+
21+
#endif

offload/DeviceRTL/src/Profiling.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
//===------- Profiling.cpp ---------------------------------------- C++ ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "Profiling.h"
10+
11+
#pragma omp begin declare target device_type(nohost)
12+
13+
extern "C" {
14+
15+
// Provides empty implementations for certain functions in compiler-rt
16+
// that are emitted by the PGO instrumentation.
17+
void __llvm_profile_register_function(void *Ptr) {}
18+
void __llvm_profile_register_names_function(void *Ptr, long int I) {}
19+
void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
20+
}
21+
22+
#pragma omp end declare target

offload/plugins-nextgen/common/include/GlobalHandler.h

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@
1313
#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
1414
#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
1515

16-
#include <string>
16+
#include <type_traits>
1717

1818
#include "llvm/ADT/DenseMap.h"
1919
#include "llvm/Object/ELFObjectFile.h"
20+
#include "llvm/ProfileData/InstrProf.h"
2021

2122
#include "Shared/Debug.h"
2223
#include "Shared/Utils.h"
@@ -55,6 +56,23 @@ class GlobalTy {
5556
void setPtr(void *P) { Ptr = P; }
5657
};
5758

59+
using IntPtrT = void *;
60+
struct __llvm_profile_data {
61+
#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \
62+
std::remove_const<Type>::type Name;
63+
#include "llvm/ProfileData/InstrProfData.inc"
64+
};
65+
66+
/// PGO profiling data extracted from a GPU device
67+
struct GPUProfGlobals {
68+
SmallVector<uint8_t> NamesData;
69+
SmallVector<SmallVector<int64_t>> Counts;
70+
SmallVector<__llvm_profile_data> Data;
71+
Triple TargetTriple;
72+
73+
void dump() const;
74+
};
75+
5876
/// Subclass of GlobalTy that holds the memory for a global of \p Ty.
5977
template <typename Ty> class StaticGlobalTy : public GlobalTy {
6078
Ty Data;
@@ -164,6 +182,15 @@ class GenericGlobalHandlerTy {
164182
return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
165183
/*D2H=*/false);
166184
}
185+
186+
/// Checks whether a given image contains profiling globals.
187+
bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image);
188+
189+
/// Reads profiling data from a GPU image and returns it as a GPUProfGlobals.
190+
/// Iterates through the image symbol table and stores global values
191+
/// with profiling prefixes.
192+
Expected<GPUProfGlobals> readProfilingGlobals(GenericDeviceTy &Device,
193+
DeviceImageTy &Image);
167194
};
168195

169196
} // namespace plugin

0 commit comments

Comments
 (0)