Skip to content

Commit c01e844

Browse files
jayfoad and kzhuravl
authored
[AMDGPU] Update compute program resource registers for GFX12 (#75911)
Co-authored-by: Konstantin Zhuravlyov <[email protected]>
1 parent 687c51a commit c01e844

File tree

8 files changed

+251
-50
lines changed

8 files changed

+251
-50
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4406,7 +4406,15 @@ The fields used by CP for code objects before V3 also match those specified in
44064406
``COMPUTE_PGM_RSRC3``
44074407
configuration
44084408
register. See
4409-
:ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table`.
4409+
:ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`.
4410+
GFX12
4411+
Compute Shader (CS)
4412+
program settings used by
4413+
CP to set up
4414+
``COMPUTE_PGM_RSRC3``
4415+
configuration
4416+
register. See
4417+
:ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx12-table`.
44104418
415:384 4 bytes COMPUTE_PGM_RSRC1 Compute Shader (CS)
44114419
program settings used by
44124420
CP to set up
@@ -4831,13 +4839,16 @@ The fields used by CP for code objects before V3 also match those specified in
48314839

48324840
Used by CP to set up
48334841
``COMPUTE_PGM_RSRC2.USER_SGPR``.
4834-
6 1 bit ENABLE_TRAP_HANDLER Must be 0.
4842+
6 1 bit ENABLE_TRAP_HANDLER GFX6-GFX11
4843+
Must be 0.
48354844

4836-
This bit represents
4837-
``COMPUTE_PGM_RSRC2.TRAP_PRESENT``,
4838-
which is set by the CP if
4839-
the runtime has installed a
4840-
trap handler.
4845+
This bit represents
4846+
``COMPUTE_PGM_RSRC2.TRAP_PRESENT``,
4847+
which is set by the CP if
4848+
the runtime has installed a
4849+
trap handler.
4850+
GFX12
4851+
Reserved, must be 0.
48414852
7 1 bit ENABLE_SGPR_WORKGROUP_ID_X Enable the setup of the
48424853
system SGPR register for
48434854
the work-group id in the X
@@ -4957,7 +4968,7 @@ The fields used by CP for code objects before V3 also match those specified in
49574968
30 1 bit ENABLE_EXCEPTION_INT_DIVIDE_BY Integer Division by Zero
49584969
_ZERO (rcp_iflag_f32 instruction
49594970
only)
4960-
31 1 bit Reserved, must be 0.
4971+
31 1 bit RESERVED Reserved, must be 0.
49614972
32 **Total size 4 bytes.**
49624973
======= ===================================================================================================================
49634974

@@ -4986,8 +4997,8 @@ The fields used by CP for code objects before V3 also match those specified in
49864997

49874998
..
49884999

4989-
.. table:: compute_pgm_rsrc3 for GFX10-GFX12
4990-
:name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table
5000+
.. table:: compute_pgm_rsrc3 for GFX10-GFX11
5001+
:name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table
49915002

49925003
======= ======= =============================== ===========================================================================
49935004
Bits Size Field Name Description
@@ -5036,6 +5047,32 @@ The fields used by CP for code objects before V3 also match those specified in
50365047
32 **Total size 4 bytes.**
50375048
======= ===================================================================================================================
50385049

5050+
..
5051+
5052+
.. table:: compute_pgm_rsrc3 for GFX12
5053+
:name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx12-table
5054+
5055+
======= ======= =============================== ===========================================================================
5056+
Bits Size Field Name Description
5057+
======= ======= =============================== ===========================================================================
5058+
3:0 4 bits RESERVED Reserved, must be 0.
5059+
11:4 8 bits INST_PREF_SIZE Number of instruction bytes to prefetch, starting at the kernel's entry
5060+
point instruction, before wavefront starts execution. The value is 0..255
5061+
with a granularity of 128 bytes.
5062+
12 1 bit RESERVED Reserved, must be 0.
5063+
13 1 bit GLG_EN If 1, group launch guarantee will be enabled for this dispatch.
5064+
30:14 17 bits RESERVED Reserved, must be 0.
5065+
31 1 bit IMAGE_OP If 1, the kernel execution contains image instructions. If executed as
5066+
part of a graphics pipeline, image read instructions will stall waiting
5067+
for any necessary ``WAIT_SYNC`` fence to be performed in order to
5068+
indicate that earlier pipeline stages have completed writing to the
5069+
image.
5070+
5071+
Not used for compute kernels that are not part of a graphics pipeline and
5072+
must be 0.
5073+
32 **Total size 4 bytes.**
5074+
======= ===================================================================================================================
5075+
50395076
..
50405077

50415078
.. table:: Floating Point Rounding Mode Enumeration Values
@@ -15508,7 +15545,7 @@ terminated by an ``.end_amdhsa_kernel`` directive.
1550815545
``.amdhsa_forward_progress`` 0 GFX10-GFX12 Controls FWD_PROGRESS in
1550915546
:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`.
1551015547
``.amdhsa_shared_vgpr_count`` 0 GFX10-GFX11 Controls SHARED_VGPR_COUNT in
15511-
:ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table`.
15548+
:ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`.
1551215549
``.amdhsa_exception_fp_ieee_invalid_op`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION in
1551315550
:ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`.
1551415551
``.amdhsa_exception_fp_denorm_src`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_FP_DENORMAL_SOURCE in

llvm/include/llvm/Support/AMDHSAKernelDescriptor.h

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,20 @@ enum : int32_t {
127127
#undef COMPUTE_PGM_RSRC1
128128

129129
// Compute program resource register 2. Must match hardware definition.
130+
// GFX6+.
130131
#define COMPUTE_PGM_RSRC2(NAME, SHIFT, WIDTH) \
131132
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_ ## NAME, SHIFT, WIDTH)
133+
// [GFX6-GFX11].
134+
#define COMPUTE_PGM_RSRC2_GFX6_GFX11(NAME, SHIFT, WIDTH) \
135+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX6_GFX11_##NAME, SHIFT, WIDTH)
136+
// GFX12+.
137+
#define COMPUTE_PGM_RSRC2_GFX12_PLUS(NAME, SHIFT, WIDTH) \
138+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX12_PLUS_##NAME, SHIFT, WIDTH)
132139
enum : int32_t {
133140
COMPUTE_PGM_RSRC2(ENABLE_PRIVATE_SEGMENT, 0, 1),
134141
COMPUTE_PGM_RSRC2(USER_SGPR_COUNT, 1, 5),
135-
COMPUTE_PGM_RSRC2(ENABLE_TRAP_HANDLER, 6, 1),
142+
COMPUTE_PGM_RSRC2_GFX6_GFX11(ENABLE_TRAP_HANDLER, 6, 1),
143+
COMPUTE_PGM_RSRC2_GFX12_PLUS(RESERVED1, 6, 1),
136144
COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
137145
COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
138146
COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
@@ -166,23 +174,37 @@ enum : int32_t {
166174

167175
// Compute program resource register 3 for GFX10+. Must match hardware
168176
// definition.
169-
// [GFX10].
170-
#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \
171-
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_ ## NAME, SHIFT, WIDTH)
172177
// GFX10+.
173178
#define COMPUTE_PGM_RSRC3_GFX10_PLUS(NAME, SHIFT, WIDTH) \
174179
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_PLUS_ ## NAME, SHIFT, WIDTH)
180+
// [GFX10].
181+
#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \
182+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_##NAME, SHIFT, WIDTH)
183+
// [GFX10-GFX11].
184+
#define COMPUTE_PGM_RSRC3_GFX10_GFX11(NAME, SHIFT, WIDTH) \
185+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX11_##NAME, SHIFT, WIDTH)
175186
// GFX11+.
176187
#define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
177188
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_ ## NAME, SHIFT, WIDTH)
189+
// [GFX11].
190+
#define COMPUTE_PGM_RSRC3_GFX11(NAME, SHIFT, WIDTH) \
191+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_##NAME, SHIFT, WIDTH)
192+
// GFX12+.
193+
#define COMPUTE_PGM_RSRC3_GFX12_PLUS(NAME, SHIFT, WIDTH) \
194+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX12_PLUS_##NAME, SHIFT, WIDTH)
178195
enum : int32_t {
179-
COMPUTE_PGM_RSRC3_GFX10_PLUS(SHARED_VGPR_COUNT, 0, 4),
180-
COMPUTE_PGM_RSRC3_GFX10(RESERVED0, 4, 8),
181-
COMPUTE_PGM_RSRC3_GFX11_PLUS(INST_PREF_SIZE, 4, 6),
182-
COMPUTE_PGM_RSRC3_GFX11_PLUS(TRAP_ON_START, 10, 1),
183-
COMPUTE_PGM_RSRC3_GFX11_PLUS(TRAP_ON_END, 11, 1),
184-
COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED1, 12, 19),
185-
COMPUTE_PGM_RSRC3_GFX10(RESERVED2, 31, 1),
196+
COMPUTE_PGM_RSRC3_GFX10_GFX11(SHARED_VGPR_COUNT, 0, 4),
197+
COMPUTE_PGM_RSRC3_GFX12_PLUS(RESERVED0, 0, 4),
198+
COMPUTE_PGM_RSRC3_GFX10(RESERVED1, 4, 8),
199+
COMPUTE_PGM_RSRC3_GFX11(INST_PREF_SIZE, 4, 6),
200+
COMPUTE_PGM_RSRC3_GFX11(TRAP_ON_START, 10, 1),
201+
COMPUTE_PGM_RSRC3_GFX11(TRAP_ON_END, 11, 1),
202+
COMPUTE_PGM_RSRC3_GFX12_PLUS(INST_PREF_SIZE, 4, 8),
203+
COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED2, 12, 1),
204+
COMPUTE_PGM_RSRC3_GFX10_GFX11(RESERVED3, 13, 1),
205+
COMPUTE_PGM_RSRC3_GFX12_PLUS(GLG_EN, 13, 1),
206+
COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED4, 14, 17),
207+
COMPUTE_PGM_RSRC3_GFX10(RESERVED5, 31, 1),
186208
COMPUTE_PGM_RSRC3_GFX11_PLUS(IMAGE_OP, 31, 1),
187209
};
188210
#undef COMPUTE_PGM_RSRC3_GFX10_PLUS

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5416,11 +5416,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
54165416
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Val,
54175417
ValRange);
54185418
} else if (ID == ".amdhsa_shared_vgpr_count") {
5419-
if (IVersion.Major < 10)
5420-
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
5419+
if (IVersion.Major < 10 || IVersion.Major >= 12)
5420+
return Error(IDRange.Start, "directive requires gfx10 or gfx11",
5421+
IDRange);
54215422
SharedVGPRCount = Val;
54225423
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3,
5423-
COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT, Val,
5424+
COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, Val,
54245425
ValRange);
54255426
} else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
54265427
PARSE_BITS_ENTRY(
@@ -5522,7 +5523,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
55225523
(AccumOffset / 4 - 1));
55235524
}
55245525

5525-
if (IVersion.Major >= 10) {
5526+
if (IVersion.Major >= 10 && IVersion.Major < 12) {
55265527
// SharedVGPRCount < 16 checked by PARSE_BITS_ENTRY
55275528
if (SharedVGPRCount && EnableWavefrontSize32 && *EnableWavefrontSize32) {
55285529
return TokError("shared_vgpr_count directive not valid on "

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,34 +1999,60 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
19991999
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED1)
20002000
return MCDisassembler::Fail;
20012001
} else if (isGFX10Plus()) {
2002-
if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
2003-
PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
2004-
COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
2002+
// Bits [0-3].
2003+
if (!isGFX12Plus()) {
2004+
if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
2005+
PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
2006+
COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
2007+
} else {
2008+
PRINT_PSEUDO_DIRECTIVE_COMMENT(
2009+
"SHARED_VGPR_COUNT",
2010+
COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
2011+
}
20052012
} else {
2006-
PRINT_PSEUDO_DIRECTIVE_COMMENT(
2007-
"SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
2013+
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0)
2014+
return MCDisassembler::Fail;
20082015
}
20092016

2010-
if (isGFX11Plus()) {
2017+
// Bits [4-11].
2018+
if (isGFX11()) {
20112019
PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
2012-
COMPUTE_PGM_RSRC3_GFX11_PLUS_INST_PREF_SIZE);
2020+
COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE);
20132021
PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
2014-
COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START);
2022+
COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START);
20152023
PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
2016-
COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_END);
2024+
COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END);
2025+
} else if (isGFX12Plus()) {
2026+
PRINT_PSEUDO_DIRECTIVE_COMMENT(
2027+
"INST_PREF_SIZE", COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE);
2028+
} else {
2029+
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED1)
2030+
return MCDisassembler::Fail;
2031+
}
2032+
2033+
// Bits [12].
2034+
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2)
2035+
return MCDisassembler::Fail;
2036+
2037+
// Bits [13].
2038+
if (isGFX12Plus()) {
2039+
PRINT_PSEUDO_DIRECTIVE_COMMENT("GLG_EN",
2040+
COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN);
20172041
} else {
2018-
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED0)
2042+
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3)
20192043
return MCDisassembler::Fail;
20202044
}
20212045

2022-
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED1)
2046+
// Bits [14-30].
2047+
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4)
20232048
return MCDisassembler::Fail;
20242049

2050+
// Bits [31].
20252051
if (isGFX11Plus()) {
20262052
PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
2027-
COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START);
2053+
COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP);
20282054
} else {
2029-
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED2)
2055+
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED5)
20302056
return MCDisassembler::Fail;
20312057
}
20322058
} else if (FourByteBuffer) {

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,8 +475,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
475475
PRINT_FIELD(OS, ".amdhsa_forward_progress", KD,
476476
compute_pgm_rsrc1,
477477
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
478+
}
479+
if (IVersion.Major >= 10 && IVersion.Major < 12) {
478480
PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3,
479-
amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
481+
amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
480482
}
481483
if (IVersion.Major >= 12)
482484
PRINT_FIELD(OS, ".amdhsa_round_robin_scheduling", KD, compute_pgm_rsrc1,

0 commit comments

Comments
 (0)