Skip to content

Commit afbfd50

Browse files
committed
[Attributor][AMDGPU] Improve indirect call support in closed modules
If we see all functions that can be called, i.e., we are in a "closed world", we can perform better reasoning in the presence of unknown callees of indirect calls. We now collect all indirectly callable functions and limit the potentially called functions to those. The AMDGPU backend is the only user for now. We should also enable this for AMDGPU (and NVIDIA GPUs in certain cases) when we run the Attributor (or OpenMP-opt) earlier in the pipeline.
1 parent a986064 commit afbfd50

20 files changed

+2077
-1732
lines changed

llvm/include/llvm/CodeGen/TargetPassConfig.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ class TargetPassConfig : public ImmutablePass {
139139
/// callers.
140140
bool RequireCodeGenSCCOrder = false;
141141

142+
/// Whether we can assume whole program visibility during codegen.
143+
bool HasWholeProgramVisibility = false;
144+
142145
/// Add the actual instruction selection passes. This does not include
143146
/// preparation passes on IR.
144147
bool addCoreISelPasses();
@@ -189,6 +192,13 @@ class TargetPassConfig : public ImmutablePass {
189192
setOpt(RequireCodeGenSCCOrder, Enable);
190193
}
191194

195+
bool getHasWholeProgramVisibility() const {
196+
return HasWholeProgramVisibility;
197+
}
198+
void setHasWholeProgramVisibility(bool Enable) {
199+
setOpt(HasWholeProgramVisibility, Enable);
200+
}
201+
192202
/// Allow the target to override a specific pass without overriding the pass
193203
/// pipeline. When passes are added to the standard pipeline at the
194204
/// point where StandardID is expected, add TargetID in its place.

llvm/include/llvm/Target/TargetMachine.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,8 @@ class TargetMachine {
378378
addPassesToEmitFile(PassManagerBase &, raw_pwrite_stream &,
379379
raw_pwrite_stream *, CodeGenFileType,
380380
bool /*DisableVerify*/ = true,
381-
MachineModuleInfoWrapperPass *MMIWP = nullptr) {
381+
MachineModuleInfoWrapperPass *MMIWP = nullptr,
382+
bool HasWholeProgramVisibility = false) {
382383
return true;
383384
}
384385

@@ -444,11 +445,11 @@ class LLVMTargetMachine : public TargetMachine {
444445
/// emitted. Typically this will involve several steps of code generation.
445446
/// \p MMIWP is an optional parameter that, if set to non-nullptr,
446447
/// will be used to set the MachineModuleInfo for this PM.
447-
bool
448-
addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out,
449-
raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
450-
bool DisableVerify = true,
451-
MachineModuleInfoWrapperPass *MMIWP = nullptr) override;
448+
bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out,
449+
raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
450+
bool DisableVerify = true,
451+
MachineModuleInfoWrapperPass *MMIWP = nullptr,
452+
bool HasWholeProgramVisibility = false) override;
452453

453454
virtual Error buildCodeGenPipeline(ModulePassManager &,
454455
MachineFunctionPassManager &,

llvm/include/llvm/Transforms/IPO/Attributor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1447,7 +1447,7 @@ struct AttributorConfig {
14471447
/// Callback function to determine if indirect call targets should be made
14481448
/// direct call targets (with an if-cascade).
14491449
std::function<bool(Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1450-
Function &AssummedCallee)>
1450+
Function &AssummedCallee, unsigned NumCallees)>
14511451
IndirectCalleeSpecializationCallback = nullptr;
14521452

14531453
/// Helper to update an underlying call graph and to delete functions.

llvm/lib/CodeGen/LLVMTargetMachine.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -114,13 +114,14 @@ LLVMTargetMachine::getTargetTransformInfo(const Function &F) const {
114114
/// addPassesToX helper drives creation and initialization of TargetPassConfig.
115115
static TargetPassConfig *
116116
addPassesToGenerateCode(LLVMTargetMachine &TM, PassManagerBase &PM,
117-
bool DisableVerify,
118-
MachineModuleInfoWrapperPass &MMIWP) {
117+
bool DisableVerify, MachineModuleInfoWrapperPass &MMIWP,
118+
bool HasWholeProgramVisibility) {
119119
// Targets may override createPassConfig to provide a target-specific
120120
// subclass.
121121
TargetPassConfig *PassConfig = TM.createPassConfig(PM);
122122
// Set PassConfig options provided by TargetMachine.
123123
PassConfig->setDisableVerify(DisableVerify);
124+
PassConfig->setHasWholeProgramVisibility(HasWholeProgramVisibility);
124125
PM.add(PassConfig);
125126
PM.add(&MMIWP);
126127

@@ -233,12 +234,12 @@ Expected<std::unique_ptr<MCStreamer>> LLVMTargetMachine::createMCStreamer(
233234
bool LLVMTargetMachine::addPassesToEmitFile(
234235
PassManagerBase &PM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
235236
CodeGenFileType FileType, bool DisableVerify,
236-
MachineModuleInfoWrapperPass *MMIWP) {
237+
MachineModuleInfoWrapperPass *MMIWP, bool HasWholeProgramVisibility) {
237238
// Add common CodeGen passes.
238239
if (!MMIWP)
239240
MMIWP = new MachineModuleInfoWrapperPass(this);
240-
TargetPassConfig *PassConfig =
241-
addPassesToGenerateCode(*this, PM, DisableVerify, *MMIWP);
241+
TargetPassConfig *PassConfig = addPassesToGenerateCode(
242+
*this, PM, DisableVerify, *MMIWP, HasWholeProgramVisibility);
242243
if (!PassConfig)
243244
return true;
244245

@@ -265,8 +266,8 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
265266
bool DisableVerify) {
266267
// Add common CodeGen passes.
267268
MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(this);
268-
TargetPassConfig *PassConfig =
269-
addPassesToGenerateCode(*this, PM, DisableVerify, *MMIWP);
269+
TargetPassConfig *PassConfig = addPassesToGenerateCode(
270+
*this, PM, DisableVerify, *MMIWP, /*HasWholeProgramVisibility=*/false);
270271
if (!PassConfig)
271272
return true;
272273
assert(TargetPassConfig::willCompleteCodeGenPipeline() &&

llvm/lib/LTO/LTOBackend.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,9 @@ static void codegen(const Config &Conf, TargetMachine *TM,
416416
if (Conf.PreCodeGenPassesHook)
417417
Conf.PreCodeGenPassesHook(CodeGenPasses);
418418
if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS,
419-
DwoOut ? &DwoOut->os() : nullptr,
420-
Conf.CGFileType))
419+
DwoOut ? &DwoOut->os() : nullptr, Conf.CGFileType,
420+
/*DisableVerify=*/true, /*MMIWP=*/nullptr,
421+
Conf.HasWholeProgramVisibility))
421422
report_fatal_error("Failed to setup codegen");
422423
CodeGenPasses.run(Mod);
423424

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ extern char &AMDGPUMachineCFGStructurizerID;
8888
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
8989

9090
Pass *createAMDGPUAnnotateKernelFeaturesPass();
91-
Pass *createAMDGPUAttributorLegacyPass();
91+
Pass *createAMDGPUAttributorLegacyPass(bool HasWholeProgramVisibility = false);
9292
void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
9393
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
9494
extern char &AMDGPUAnnotateKernelFeaturesID;
@@ -271,8 +271,13 @@ class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
271271
private:
272272
TargetMachine &TM;
273273

274+
/// Whether we can assume whole program visibility during codegen.
275+
bool HasWholeProgramVisibility = false;
276+
274277
public:
275-
AMDGPUAttributorPass(TargetMachine &TM) : TM(TM){};
278+
AMDGPUAttributorPass(TargetMachine &TM,
279+
bool HasWholeProgramVisibility = false)
280+
: TM(TM), HasWholeProgramVisibility(HasWholeProgramVisibility){};
276281
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
277282
};
278283

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,15 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17+
#include "llvm/Analysis/TargetTransformInfo.h"
1718
#include "llvm/CodeGen/TargetPassConfig.h"
19+
#include "llvm/IR/CallingConv.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
1921
#include "llvm/IR/IntrinsicsR600.h"
22+
#include "llvm/Support/Casting.h"
2023
#include "llvm/Target/TargetMachine.h"
2124
#include "llvm/Transforms/IPO/Attributor.h"
25+
#include <optional>
2226

2327
#define DEBUG_TYPE "amdgpu-attributor"
2428

@@ -933,7 +937,8 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
933937
}
934938
}
935939

936-
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
940+
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
941+
bool HasWholeProgramVisibility) {
937942
SetVector<Function *> Functions;
938943
for (Function &F : M) {
939944
if (!F.isIntrinsic())
@@ -947,12 +952,31 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
947952
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
948953
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
949954
&AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
950-
&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
955+
&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
956+
&AAIndirectCallInfo::ID});
957+
958+
/// Helper to decide if we should specialize the indirect \p CB for \p Callee,
959+
/// which is one of the \p NumCallees potential callees.
960+
auto IndirectCalleeSpecializationCallback =
961+
[&](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
962+
Function &Callee, unsigned NumCallees) {
963+
if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv()))
964+
return false;
965+
// Singleton functions should be specialized.
966+
if (NumCallees == 1)
967+
return true;
968+
// Otherwise specialize uniform values.
969+
const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller());
970+
return TTI.isAlwaysUniform(CB.getCalledOperand());
971+
};
951972

952973
AttributorConfig AC(CGUpdater);
953974
AC.Allowed = &Allowed;
954975
AC.IsModulePass = true;
955976
AC.DefaultInitializeLiveInternals = false;
977+
AC.IsClosedWorldModule = HasWholeProgramVisibility;
978+
AC.IndirectCalleeSpecializationCallback =
979+
IndirectCalleeSpecializationCallback;
956980
AC.IPOAmendableCB = [](const Function &F) {
957981
return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
958982
};
@@ -978,8 +1002,12 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
9781002
}
9791003

9801004
class AMDGPUAttributorLegacy : public ModulePass {
1005+
/// Whether we can assume whole program visibility during codegen.
1006+
bool HasWholeProgramVisibility = false;
1007+
9811008
public:
982-
AMDGPUAttributorLegacy() : ModulePass(ID) {}
1009+
AMDGPUAttributorLegacy(bool HasWholeProgramVisibility = false)
1010+
: ModulePass(ID), HasWholeProgramVisibility(HasWholeProgramVisibility) {}
9831011

9841012
/// doInitialization - Virtual method overridden by subclasses to do
9851013
/// any necessary initialization before any pass is run.
@@ -994,7 +1022,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
9941022

9951023
bool runOnModule(Module &M) override {
9961024
AnalysisGetter AG(this);
997-
return runImpl(M, AG, *TM);
1025+
return runImpl(M, AG, *TM, HasWholeProgramVisibility);
9981026
}
9991027

10001028
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1015,14 +1043,15 @@ PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
10151043
AnalysisGetter AG(FAM);
10161044

10171045
// TODO: Probably preserves CFG
1018-
return runImpl(M, AG, TM) ? PreservedAnalyses::none()
1019-
: PreservedAnalyses::all();
1046+
return runImpl(M, AG, TM, HasWholeProgramVisibility)
1047+
? PreservedAnalyses::none()
1048+
: PreservedAnalyses::all();
10201049
}
10211050

10221051
char AMDGPUAttributorLegacy::ID = 0;
10231052

1024-
Pass *llvm::createAMDGPUAttributorLegacyPass() {
1025-
return new AMDGPUAttributorLegacy();
1053+
Pass *llvm::createAMDGPUAttributorLegacyPass(bool HasWholeProgramVisibility) {
1054+
return new AMDGPUAttributorLegacy(HasWholeProgramVisibility);
10261055
}
10271056
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
10281057
false, false)

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
#include "llvm/InitializePasses.h"
5151
#include "llvm/MC/TargetRegistry.h"
5252
#include "llvm/Passes/PassBuilder.h"
53+
#include "llvm/Support/Signals.h"
5354
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
5455
#include "llvm/Transforms/IPO.h"
5556
#include "llvm/Transforms/IPO/AlwaysInliner.h"
@@ -625,7 +626,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
625626
[this](StringRef PassName, ModulePassManager &PM,
626627
ArrayRef<PassBuilder::PipelineElement>) {
627628
if (PassName == "amdgpu-attributor") {
628-
PM.addPass(AMDGPUAttributorPass(*this));
629+
PM.addPass(AMDGPUAttributorPass(*this, HasWholeProgramVisibility));
629630
return true;
630631
}
631632
if (PassName == "amdgpu-unify-metadata") {
@@ -1004,7 +1005,8 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
10041005
}
10051006

10061007
void AMDGPUPassConfig::addIRPasses() {
1007-
const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1008+
AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1009+
TM.HasWholeProgramVisibility = getHasWholeProgramVisibility();
10081010

10091011
Triple::ArchType Arch = TM.getTargetTriple().getArch();
10101012
if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
@@ -1041,7 +1043,7 @@ void AMDGPUPassConfig::addIRPasses() {
10411043
// AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
10421044
// after their introduction
10431045
if (TM.getOptLevel() > CodeGenOptLevel::None)
1044-
addPass(createAMDGPUAttributorLegacyPass());
1046+
addPass(createAMDGPUAttributorLegacyPass(HasWholeProgramVisibility));
10451047

10461048
if (TM.getOptLevel() > CodeGenOptLevel::None)
10471049
addPass(createInferAddressSpacesPass());

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
3838
static bool EnableFunctionCalls;
3939
static bool EnableLowerModuleLDS;
4040

41+
/// Whether we can assume whole program visibility during codegen.
42+
bool HasWholeProgramVisibility = false;
43+
4144
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
4245
StringRef FS, TargetOptions Options,
4346
std::optional<Reloc::Model> RM,

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
2+
; RUN: llc -global-isel -stop-after=irtranslator -attributor-assume-closed-world=false -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope --check-prefixes=SAMEC,CHECK %s
3+
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope --check-prefixes=SAMEC,CWRLD %s
34

45
define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
56
; CHECK-LABEL: name: test_indirect_call_sgpr_ptr
@@ -52,24 +53,31 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
5253
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
5354
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
5455
; CHECK-NEXT: S_ENDPGM 0
56+
;
57+
; CWRLD-LABEL: name: test_indirect_call_sgpr_ptr
58+
; CWRLD: bb.1 (%ir-block.0):
59+
; CWRLD-NEXT: liveins: $sgpr4_sgpr5
60+
; CWRLD-NEXT: {{ $}}
61+
; CWRLD-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
62+
; CWRLD-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
5563
call void %fptr()
5664
ret void
5765
}
5866

5967
define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(ptr %fptr) {
60-
; CHECK-LABEL: name: test_gfx_indirect_call_sgpr_ptr
61-
; CHECK: bb.1 (%ir-block.0):
62-
; CHECK-NEXT: liveins: $vgpr0, $vgpr1
63-
; CHECK-NEXT: {{ $}}
64-
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
65-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
66-
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
67-
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
68-
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
69-
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
70-
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
71-
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
72-
; CHECK-NEXT: SI_RETURN
68+
; SAMEC-LABEL: name: test_gfx_indirect_call_sgpr_ptr
69+
; SAMEC: bb.1 (%ir-block.0):
70+
; SAMEC-NEXT: liveins: $vgpr0, $vgpr1
71+
; SAMEC-NEXT: {{ $}}
72+
; SAMEC-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
73+
; SAMEC-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
74+
; SAMEC-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
75+
; SAMEC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
76+
; SAMEC-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
77+
; SAMEC-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
78+
; SAMEC-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
79+
; SAMEC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
80+
; SAMEC-NEXT: SI_RETURN
7381
call amdgpu_gfx void %fptr()
7482
ret void
7583
}

0 commit comments

Comments
 (0)