Skip to content

Commit 7bab863

Browse files
kerbowa authored and bcahoon committed
[AMDGPU] Add DAG ISel support for preloaded kernel arguments
This patch adds the DAG ISel changes for kernel argument preloading. These changes are not usable with older firmware, but subsequent patches in the series will make the codegen backwards compatible. This patch should only be submitted alongside that subsequent patch.
Preloading begins at the start of the kernel arguments and continues up to the number of arguments indicated by the CL flag amdgpu-kernarg-preload-count. Aggregates and arguments passed by-ref are not supported.
Special care is needed for the alignment of the kernarg segment, and the alignment of addressable SGPR tuples must also be considered for cases where we cannot directly use the misaligned large tuples that the arguments are loaded into.
Reviewed By: bcahoon
Differential Revision: https://reviews.llvm.org/D158579
Change-Id: I43203024e59bcb01f29bc1b6cb78a94eaa91476a
1 parent aefa7dc commit 7bab863

16 files changed

+5778
-74
lines changed

llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) {
6060
return false;
6161
}
6262

63+
// TODO: Print preload kernargs?
6364
void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
6465
for (const auto &FI : ArgInfoMap) {
6566
OS << "Arguments for " << FI.first->getName() << '\n'
@@ -148,7 +149,7 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
148149
llvm_unreachable("unexpected preloaded value type");
149150
}
150151

151-
constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
152+
AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
152153
AMDGPUFunctionArgInfo AI;
153154
AI.PrivateSegmentBuffer
154155
= ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);

llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

Lines changed: 16 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
1111

12+
#include "llvm/ADT/DenseMap.h"
1213
#include "llvm/CodeGen/Register.h"
1314
#include "llvm/Pass.h"
1415

@@ -37,22 +38,19 @@ struct ArgDescriptor {
3738
bool IsSet : 1;
3839

3940
public:
40-
constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41-
bool IsStack = false, bool IsSet = false)
42-
: Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
41+
ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false,
42+
bool IsSet = false)
43+
: Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
4344

44-
static constexpr ArgDescriptor createRegister(Register Reg,
45-
unsigned Mask = ~0u) {
45+
static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
4646
return ArgDescriptor(Reg, Mask, false, true);
4747
}
4848

49-
static constexpr ArgDescriptor createStack(unsigned Offset,
50-
unsigned Mask = ~0u) {
49+
static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
5150
return ArgDescriptor(Offset, Mask, true, true);
5251
}
5352

54-
static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55-
unsigned Mask) {
53+
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
5654
return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
5755
}
5856

@@ -94,6 +92,11 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
9492
return OS;
9593
}
9694

95+
struct KernArgPreloadDescriptor : public ArgDescriptor {
96+
KernArgPreloadDescriptor() {}
97+
SmallVector<MCRegister> Regs;
98+
};
99+
97100
struct AMDGPUFunctionArgInfo {
98101
enum PreloadedValue {
99102
// SGPRS:
@@ -151,10 +154,13 @@ struct AMDGPUFunctionArgInfo {
151154
ArgDescriptor WorkItemIDY;
152155
ArgDescriptor WorkItemIDZ;
153156

157+
// Map the index of preloaded kernel arguments to its descriptor.
158+
SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{};
159+
154160
std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
155161
getPreloadedValue(PreloadedValue Value) const;
156162

157-
static constexpr AMDGPUFunctionArgInfo fixedABILayout();
163+
static AMDGPUFunctionArgInfo fixedABILayout();
158164
};
159165

160166
class AMDGPUArgumentUsageInfo : public ImmutablePass {

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -219,6 +219,11 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
219219

220220
if (STM.isAmdHsaOS())
221221
HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
222+
223+
if (MFI.getNumKernargPreloadedSGPRs() > 0) {
224+
assert(AMDGPU::hasKernargPreload(STM));
225+
getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI());
226+
}
222227
}
223228

224229
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
@@ -436,6 +441,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
436441
const SIProgramInfo &PI) const {
437442
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
438443
const Function &F = MF.getFunction();
444+
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
439445

440446
amdhsa::kernel_descriptor_t KernelDescriptor;
441447
memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
@@ -459,6 +465,10 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
459465
KernelDescriptor.compute_pgm_rsrc3 =
460466
CurrentProgramInfo.ComputePGMRSrc3GFX90A;
461467

468+
if (AMDGPU::hasKernargPreload(STM))
469+
KernelDescriptor.kernarg_preload =
470+
static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
471+
462472
return KernelDescriptor;
463473
}
464474

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 18 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1002,7 +1002,8 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
10021002
}
10031003

10041004
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
1005-
const GCNSubtarget &ST) {
1005+
const GCNSubtarget &ST)
1006+
: ST(ST) {
10061007
const CallingConv::ID CC = F.getCallingConv();
10071008
const bool IsKernel =
10081009
CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
@@ -1043,30 +1044,35 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
10431044
!ST.flatScratchIsArchitected()) {
10441045
FlatScratchInit = true;
10451046
}
1046-
}
10471047

1048-
unsigned GCNUserSGPRUsageInfo::getNumUsedUserSGPRs() const {
1049-
unsigned NumUserSGPRs = 0;
10501048
if (hasImplicitBufferPtr())
1051-
NumUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
1049+
NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
10521050

10531051
if (hasPrivateSegmentBuffer())
1054-
NumUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
1052+
NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
10551053

10561054
if (hasDispatchPtr())
1057-
NumUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
1055+
NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
10581056

10591057
if (hasQueuePtr())
1060-
NumUserSGPRs += getNumUserSGPRForField(QueuePtrID);
1058+
NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
10611059

10621060
if (hasKernargSegmentPtr())
1063-
NumUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
1061+
NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
10641062

10651063
if (hasDispatchID())
1066-
NumUserSGPRs += getNumUserSGPRForField(DispatchIdID);
1064+
NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
10671065

10681066
if (hasFlatScratchInit())
1069-
NumUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
1067+
NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
1068+
}
1069+
1070+
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
1071+
assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
1072+
NumKernargPreloadSGPRs += NumSGPRs;
1073+
NumUsedUserSGPRs += NumSGPRs;
1074+
}
10701075

1071-
return NumUserSGPRs;
1076+
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
1077+
return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
10721078
}

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1383,8 +1383,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13831383

13841384
class GCNUserSGPRUsageInfo {
13851385
public:
1386-
unsigned getNumUsedUserSGPRs() const;
1387-
13881386
bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
13891387

13901388
bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
@@ -1399,6 +1397,14 @@ class GCNUserSGPRUsageInfo {
13991397

14001398
bool hasFlatScratchInit() const { return FlatScratchInit; }
14011399

1400+
unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1401+
1402+
unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1403+
1404+
unsigned getNumFreeUserSGPRs();
1405+
1406+
void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1407+
14021408
enum UserSGPRID : unsigned {
14031409
ImplicitBufferPtrID = 0,
14041410
PrivateSegmentBufferID = 1,
@@ -1436,6 +1442,8 @@ class GCNUserSGPRUsageInfo {
14361442
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
14371443

14381444
private:
1445+
const GCNSubtarget &ST;
1446+
14391447
// Private memory buffer
14401448
// Compute directly in sgpr[0:1]
14411449
// Other shaders indirect 64-bits at sgpr[0:1]
@@ -1452,6 +1460,10 @@ class GCNUserSGPRUsageInfo {
14521460
bool DispatchID = false;
14531461

14541462
bool FlatScratchInit = false;
1463+
1464+
unsigned NumKernargPreloadSGPRs = 0;
1465+
1466+
unsigned NumUsedUserSGPRs = 0;
14551467
};
14561468

14571469
} // end namespace llvm

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -829,6 +829,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
829829
return true;
830830
}
831831

832+
bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader(
833+
const MCSubtargetInfo &STI) {
834+
for (int i = 0; i < 64; ++i) {
835+
OS << "\ts_nop 0\n";
836+
}
837+
return true;
838+
}
839+
840+
bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader(
841+
const MCSubtargetInfo &STI) {
842+
const uint32_t Encoded_s_nop = 0xbf800000;
843+
MCStreamer &OS = getStreamer();
844+
for (int i = 0; i < 64; ++i) {
845+
OS.emitInt32(Encoded_s_nop);
846+
}
847+
return true;
848+
}
849+
832850
bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
833851
const uint32_t Encoded_s_code_end = 0xbf9f0000;
834852
const uint32_t Encoded_s_nop = 0xbf800000;

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,11 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
9090
/// \returns True on success, false on failure.
9191
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; }
9292

93+
/// \returns True on success, false on failure.
94+
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) {
95+
return true;
96+
}
97+
9398
virtual void EmitAmdhsaKernelDescriptor(
9499
const MCSubtargetInfo &STI, StringRef KernelName,
95100
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
@@ -154,6 +159,9 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
154159
/// \returns True on success, false on failure.
155160
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
156161

162+
/// \returns True on success, false on failure.
163+
bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
164+
157165
void EmitAmdhsaKernelDescriptor(
158166
const MCSubtargetInfo &STI, StringRef KernelName,
159167
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
@@ -215,6 +223,9 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
215223
/// \returns True on success, false on failure.
216224
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
217225

226+
/// \returns True on success, false on failure.
227+
bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
228+
218229
void EmitAmdhsaKernelDescriptor(
219230
const MCSubtargetInfo &STI, StringRef KernelName,
220231
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,

0 commit comments

Comments
 (0)