Skip to content

Commit c1fabd6

Browse files
authored
[llvm][AMDGPU] Enable FWD_PROGRESS bit for GFX10+ (llvm#128367)
From GFX10 onwards it is possible to employ benevolent scheduling of waves. This patch unconditionally enables, for the `amdhsa` OS, the bit which controls that capability, as it is beneficial for algorithms that rely on more complex concurrent coordination and it is generally performance neutral otherwise.
1 parent 541b8f2 commit c1fabd6

File tree

14 files changed

+63
-47
lines changed

14 files changed

+63
-47
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18208,7 +18208,7 @@ terminated by an ``.end_amdhsa_kernel`` directive.
1820818208
(cumode)
1820918209
``.amdhsa_memory_ordered`` 1 GFX10-GFX12 Controls MEM_ORDERED in
1821018210
:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`.
18211-
``.amdhsa_forward_progress`` 0 GFX10-GFX12 Controls FWD_PROGRESS in
18211+
``.amdhsa_forward_progress`` 1 GFX10-GFX12 Controls FWD_PROGRESS in
1821218212
:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`.
1821318213
``.amdhsa_shared_vgpr_count`` 0 GFX10-GFX11 Controls SHARED_VGPR_COUNT in
1821418214
:ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`.

llvm/docs/ReleaseNotes.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ Changes to the AArch64 Backend
8686
Changes to the AMDGPU Backend
8787
-----------------------------
8888

89+
* Enabled the
90+
[FWD_PROGRESS bit](https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor)
91+
for all GFX ISAs greater or equal to 10, for the AMDHSA OS.
92+
8993
Changes to the ARM Backend
9094
--------------------------
9195

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
11971197
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
11981198
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
11991199
ProgInfo.MemOrdered = 1;
1200+
ProgInfo.FwdProgress = 1;
12001201
}
12011202

12021203
// 0 = X, 1 = XY, 2 = XYZ

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ MCKernelDescriptor::getDefaultAmdhsaKernelDescriptor(const MCSubtargetInfo *STI,
7070
KD.compute_pgm_rsrc1, OneMCExpr,
7171
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
7272
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Ctx);
73+
74+
MCKernelDescriptor::bits_set(
75+
KD.compute_pgm_rsrc1, OneMCExpr,
76+
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_SHIFT,
77+
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Ctx);
7378
}
7479
if (AMDGPU::isGFX90A(*STI) && STI->getFeatureBits().test(FeatureTgSplit))
7580
MCKernelDescriptor::bits_set(

llvm/lib/Target/AMDGPU/SIProgramInfo.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
3939
IEEEMode = 0;
4040
WgpMode = 0;
4141
MemOrdered = 0;
42+
FwdProgress = 0;
4243
RrWgMode = 0;
4344
ScratchSize = ZeroExpr;
4445

@@ -92,6 +93,10 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
9293
if (ST.hasIEEEMode())
9394
Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
9495

96+
// TODO: in the long run we will want to enable this unconditionally.
97+
if (ST.getTargetTriple().getOS() == Triple::OSType::AMDHSA)
98+
Reg |= S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress);
99+
95100
if (ST.hasRrWGMode())
96101
Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
97102

llvm/lib/Target/AMDGPU/SIProgramInfo.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
4141
uint32_t DX10Clamp = 0;
4242
uint32_t DebugMode = 0;
4343
uint32_t IEEEMode = 0;
44-
uint32_t WgpMode = 0; // GFX10+
45-
uint32_t MemOrdered = 0; // GFX10+
46-
uint32_t RrWgMode = 0; // GFX12+
44+
uint32_t WgpMode = 0; // GFX10+
45+
uint32_t MemOrdered = 0; // GFX10+
46+
uint32_t FwdProgress = 0; // GFX10+
47+
uint32_t RrWgMode = 0; // GFX12+
4748
const MCExpr *ScratchSize = nullptr;
4849

4950
// State used to calculate fields set in PGM_RSRC2 pm4 packet.

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1331,7 +1331,7 @@ void initDefaultAMDKernelCodeT(AMDGPUMCKernelCodeT &KernelCode,
13311331
if (Version.Major >= 10) {
13321332
KernelCode.compute_pgm_resource_registers |=
13331333
S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
1334-
S_00B848_MEM_ORDERED(1);
1334+
S_00B848_MEM_ORDERED(1) | S_00B848_FWD_PROGRESS(1);
13351335
}
13361336
}
13371337

llvm/test/MC/AMDGPU/hsa-gfx12-v4.s

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
3030
// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
3131
// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
32-
// OBJDUMP-NEXT: 0030 00000c60 80000000 00040000 00000000
32+
// OBJDUMP-NEXT: 0030 00000ce0 80000000 00040000 00000000
3333
// complete
3434
// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
3535
// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
@@ -39,12 +39,12 @@
3939
// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
4040
// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
4141
// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
42-
// OBJDUMP-NEXT: 00b0 00000060 80000000 00040000 00000000
42+
// OBJDUMP-NEXT: 00b0 000000e0 80000000 00040000 00000000
4343
// disabled_user_sgpr
4444
// OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000
4545
// OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000
4646
// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
47-
// OBJDUMP-NEXT: 00f0 00000c60 80000000 00040000 00000000
47+
// OBJDUMP-NEXT: 00f0 00000ce0 80000000 00040000 00000000
4848

4949
.text
5050

llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -126,16 +126,16 @@ expr_defined:
126126
// ASM-NEXT: .amdhsa_reserve_vcc defined_boolean
127127
// ASM-NEXT: .amdhsa_reserve_flat_scratch defined_boolean
128128
// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
129-
// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12
130-
// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14
131-
// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16
132-
// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18
133-
// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21
134-
// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23
135-
// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26
136-
// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29
137-
// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30
138-
// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31
129+
// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12
130+
// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14
131+
// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16
132+
// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18
133+
// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21
134+
// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23
135+
// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26
136+
// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29
137+
// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30
138+
// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31
139139
// ASM-NEXT: .amdhsa_shared_vgpr_count 0
140140
// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&16777216)>>24
141141
// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&33554432)>>25

0 commit comments

Comments
 (0)