Skip to content

Commit e286172

Browse files
jdoerfert authored and shiltian committed
[Attributor][AMDGPU] Improve indirect call support in closed modules
If we see all functions that can be called, thus in a "closed world", we can perform better reasoning in the presence of unknown callees of indirect calls. We now collect all indirectly callable functions and limit the potentially called functions to those. The AMDGPU backend is the only user for now. We should enable this for AMDGPU (and NVIDIA GPUs in certain cases) also when we run the Attributor (or OpenMP-opt) earlier in the pipeline.
1 parent 606b50e commit e286172

File tree

10 files changed

+380
-191
lines changed

10 files changed

+380
-191
lines changed

llvm/include/llvm/Transforms/IPO/Attributor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1448,7 +1448,7 @@ struct AttributorConfig {
14481448
/// Callback function to determine if an indirect call targets should be made
14491449
/// direct call targets (with an if-cascade).
14501450
std::function<bool(Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1451-
Function &AssummedCallee)>
1451+
Function &AssummedCallee, unsigned NumCallees)>
14521452
IndirectCalleeSpecializationCallback = nullptr;
14531453

14541454
/// Helper to update an underlying call graph and to delete functions.

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ extern char &AMDGPUMachineCFGStructurizerID;
8989
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
9090

9191
Pass *createAMDGPUAnnotateKernelFeaturesPass();
92-
Pass *createAMDGPUAttributorLegacyPass();
92+
Pass *createAMDGPUAttributorLegacyPass(bool HasWholeProgramVisibility = false);
9393
void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
9494
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
9595
extern char &AMDGPUAnnotateKernelFeaturesID;
@@ -287,8 +287,13 @@ class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
287287
private:
288288
TargetMachine &TM;
289289

290+
/// Asserts whether we can assume whole program visibility during codegen.
291+
bool HasWholeProgramVisibility = false;
292+
290293
public:
291-
AMDGPUAttributorPass(TargetMachine &TM) : TM(TM){};
294+
AMDGPUAttributorPass(TargetMachine &TM,
295+
bool HasWholeProgramVisibility = false)
296+
: TM(TM), HasWholeProgramVisibility(HasWholeProgramVisibility){};
292297
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
293298
};
294299

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,15 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17+
#include "llvm/Analysis/TargetTransformInfo.h"
1718
#include "llvm/CodeGen/TargetPassConfig.h"
19+
#include "llvm/IR/CallingConv.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
1921
#include "llvm/IR/IntrinsicsR600.h"
22+
#include "llvm/Support/Casting.h"
2023
#include "llvm/Target/TargetMachine.h"
2124
#include "llvm/Transforms/IPO/Attributor.h"
25+
#include <optional>
2226

2327
#define DEBUG_TYPE "amdgpu-attributor"
2428

@@ -1023,7 +1027,8 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
10231027
}
10241028
}
10251029

1026-
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
1030+
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1031+
bool HasWholeProgramVisibility) {
10271032
SetVector<Function *> Functions;
10281033
for (Function &F : M) {
10291034
if (!F.isIntrinsic())
@@ -1036,14 +1041,33 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
10361041
DenseSet<const char *> Allowed(
10371042
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
10381043
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
1039-
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
1040-
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
1041-
&AAUnderlyingObjects::ID});
1044+
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID,
1045+
&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
1046+
&AAIndirectCallInfo::ID});
1047+
1048+
/// Helper to decide if we should specialize the indirect \p CB for \p Callee,
1049+
/// which is one of the \p NumCallees potential callees.
1050+
auto IndirectCalleeSpecializationCallback =
1051+
[&](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1052+
Function &Callee, unsigned NumCallees) {
1053+
if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv()))
1054+
return false;
1055+
// Singleton functions should be specialized.
1056+
if (NumCallees == 1)
1057+
return true;
1058+
// Otherewise specialize uniform values.
1059+
const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller());
1060+
return TTI.isAlwaysUniform(CB.getCalledOperand());
1061+
};
10421062

10431063
AttributorConfig AC(CGUpdater);
10441064
AC.Allowed = &Allowed;
10451065
AC.IsModulePass = true;
10461066
AC.DefaultInitializeLiveInternals = false;
1067+
errs() << "HasWholeProgramVisibility " << HasWholeProgramVisibility << "\n";
1068+
AC.IsClosedWorldModule = HasWholeProgramVisibility;
1069+
AC.IndirectCalleeSpecializationCallback =
1070+
IndirectCalleeSpecializationCallback;
10471071
AC.IPOAmendableCB = [](const Function &F) {
10481072
return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
10491073
};
@@ -1070,8 +1094,12 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
10701094
}
10711095

10721096
class AMDGPUAttributorLegacy : public ModulePass {
1097+
/// Asserts whether we can assume whole program visibility during codegen.
1098+
bool HasWholeProgramVisibility = false;
1099+
10731100
public:
1074-
AMDGPUAttributorLegacy() : ModulePass(ID) {}
1101+
AMDGPUAttributorLegacy(bool HasWholeProgramVisibility = false)
1102+
: ModulePass(ID), HasWholeProgramVisibility(HasWholeProgramVisibility) {}
10751103

10761104
/// doInitialization - Virtual method overridden by subclasses to do
10771105
/// any necessary initialization before any pass is run.
@@ -1086,7 +1114,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
10861114

10871115
bool runOnModule(Module &M) override {
10881116
AnalysisGetter AG(this);
1089-
return runImpl(M, AG, *TM);
1117+
return runImpl(M, AG, *TM, HasWholeProgramVisibility);
10901118
}
10911119

10921120
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1107,14 +1135,15 @@ PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
11071135
AnalysisGetter AG(FAM);
11081136

11091137
// TODO: Probably preserves CFG
1110-
return runImpl(M, AG, TM) ? PreservedAnalyses::none()
1111-
: PreservedAnalyses::all();
1138+
return runImpl(M, AG, TM, HasWholeProgramVisibility)
1139+
? PreservedAnalyses::none()
1140+
: PreservedAnalyses::all();
11121141
}
11131142

11141143
char AMDGPUAttributorLegacy::ID = 0;
11151144

1116-
Pass *llvm::createAMDGPUAttributorLegacyPass() {
1117-
return new AMDGPUAttributorLegacy();
1145+
Pass *llvm::createAMDGPUAttributorLegacyPass(bool HasWholeProgramVisibility) {
1146+
return new AMDGPUAttributorLegacy(HasWholeProgramVisibility);
11181147
}
11191148
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
11201149
false, false)

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -734,16 +734,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
734734
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
735735
});
736736

737-
// FIXME: Why is AMDGPUAttributor not in CGSCC?
738-
PB.registerOptimizerLastEPCallback(
739-
[this](ModulePassManager &MPM, OptimizationLevel Level) {
740-
if (Level != OptimizationLevel::O0) {
741-
MPM.addPass(AMDGPUAttributorPass(*this));
742-
}
743-
});
744-
745737
PB.registerFullLinkTimeOptimizationLastEPCallback(
746738
[this](ModulePassManager &PM, OptimizationLevel Level) {
739+
if (Level != OptimizationLevel::O0)
740+
PM.addPass(
741+
AMDGPUAttributorPass(*this, /*HasWholeProgramVisibility*/ true));
747742
// We want to support the -lto-partitions=N option as "best effort".
748743
// For that, we need to lower LDS earlier in the pipeline before the
749744
// module is partitioned for codegen.

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll

Lines changed: 64 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,75 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
2+
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope --check-prefixes=SAMEC,CHECK %s
3+
; RUN: llc -global-isel -stop-after=irtranslator -attributor-assume-closed-world -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope --check-prefixes=SAMEC,CWRLD %s
34

45
define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
5-
; CHECK-LABEL: name: test_indirect_call_sgpr_ptr
6-
; CHECK: bb.1 (%ir-block.0):
7-
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
8-
; CHECK-NEXT: {{ $}}
9-
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
10-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
11-
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
12-
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14
13-
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13
14-
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12
15-
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
16-
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
17-
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7
18-
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
19-
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.fptr.kernarg.offset1, align 16, addrspace 4)
20-
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
21-
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
22-
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
23-
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4)
24-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
25-
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64)
26-
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]]
27-
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]]
28-
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]
29-
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
30-
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
31-
; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
32-
; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
33-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
34-
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32)
35-
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]]
36-
; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
37-
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
38-
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
39-
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
40-
; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
41-
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
42-
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
43-
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4)
44-
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
45-
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
46-
; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32)
47-
; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32)
48-
; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32)
49-
; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32)
50-
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
51-
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
52-
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
53-
; CHECK-NEXT: S_ENDPGM 0
6+
; SAMEC-LABEL: name: test_indirect_call_sgpr_ptr
7+
; SAMEC: bb.1 (%ir-block.0):
8+
; SAMEC-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
9+
; SAMEC-NEXT: {{ $}}
10+
; SAMEC-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
11+
; SAMEC-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
12+
; SAMEC-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
13+
; SAMEC-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14
14+
; SAMEC-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13
15+
; SAMEC-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12
16+
; SAMEC-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
17+
; SAMEC-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
18+
; SAMEC-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7
19+
; SAMEC-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
20+
; SAMEC-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.fptr.kernarg.offset1, align 16, addrspace 4)
21+
; SAMEC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
22+
; SAMEC-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
23+
; SAMEC-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
24+
; SAMEC-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4)
25+
; SAMEC-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
26+
; SAMEC-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64)
27+
; SAMEC-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]]
28+
; SAMEC-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]]
29+
; SAMEC-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]]
30+
; SAMEC-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
31+
; SAMEC-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
32+
; SAMEC-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
33+
; SAMEC-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
34+
; SAMEC-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
35+
; SAMEC-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32)
36+
; SAMEC-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]]
37+
; SAMEC-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
38+
; SAMEC-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
39+
; SAMEC-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32)
40+
; SAMEC-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
41+
; SAMEC-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
42+
; SAMEC-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
43+
; SAMEC-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
44+
; SAMEC-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4)
45+
; SAMEC-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
46+
; SAMEC-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64)
47+
; SAMEC-NEXT: $sgpr12 = COPY [[COPY12]](s32)
48+
; SAMEC-NEXT: $sgpr13 = COPY [[COPY13]](s32)
49+
; SAMEC-NEXT: $sgpr14 = COPY [[COPY14]](s32)
50+
; SAMEC-NEXT: $sgpr15 = COPY [[DEF1]](s32)
51+
; SAMEC-NEXT: $vgpr31 = COPY [[OR1]](s32)
52+
; SAMEC-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
53+
; SAMEC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
54+
; SAMEC-NEXT: S_ENDPGM 0
5455
call void %fptr()
5556
ret void
5657
}
5758

5859
define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(ptr %fptr) {
59-
; CHECK-LABEL: name: test_gfx_indirect_call_sgpr_ptr
60-
; CHECK: bb.1 (%ir-block.0):
61-
; CHECK-NEXT: liveins: $vgpr0, $vgpr1
62-
; CHECK-NEXT: {{ $}}
63-
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
64-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
65-
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
66-
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
67-
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
68-
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
69-
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
70-
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
71-
; CHECK-NEXT: SI_RETURN
60+
; SAMEC-LABEL: name: test_gfx_indirect_call_sgpr_ptr
61+
; SAMEC: bb.1 (%ir-block.0):
62+
; SAMEC-NEXT: liveins: $vgpr0, $vgpr1
63+
; SAMEC-NEXT: {{ $}}
64+
; SAMEC-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
65+
; SAMEC-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
66+
; SAMEC-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
67+
; SAMEC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
68+
; SAMEC-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
69+
; SAMEC-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
70+
; SAMEC-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
71+
; SAMEC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
72+
; SAMEC-NEXT: SI_RETURN
7273
call amdgpu_gfx void %fptr()
7374
ret void
7475
}

0 commit comments

Comments (0)