Skip to content

Commit 42b9ea8

Browse files
authored
[AMDGPU] Increase max scratch allocation for GFX12 (llvm#77625)
1 parent 36ef291 commit 42b9ea8

File tree

4 files changed

+69
-40
lines changed

4 files changed

+69
-40
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -981,8 +981,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
981981

982982
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
983983
OutStreamer->emitInt32(
984-
STM.getGeneration() >= AMDGPUSubtarget::GFX11
985-
? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
984+
STM.getGeneration() >= AMDGPUSubtarget::GFX12
985+
? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
986+
: STM.getGeneration() == AMDGPUSubtarget::GFX11
987+
? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
986988
: S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
987989

988990
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
@@ -993,8 +995,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
993995
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
994996
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
995997
OutStreamer->emitInt32(
996-
STM.getGeneration() >= AMDGPUSubtarget::GFX11
997-
? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
998+
STM.getGeneration() >= AMDGPUSubtarget::GFX12
999+
? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
1000+
: STM.getGeneration() == AMDGPUSubtarget::GFX11
1001+
? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
9981002
: S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
9991003
}
10001004

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,12 +297,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
297297

298298
unsigned getMaxWaveScratchSize() const {
299299
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
300-
if (getGeneration() < GFX11) {
301-
// 13-bit field in units of 256-dword.
302-
return (256 * 4) * ((1 << 13) - 1);
300+
if (getGeneration() >= GFX12) {
301+
// 18-bit field in units of 64-dword.
302+
return (64 * 4) * ((1 << 18) - 1);
303303
}
304-
// 15-bit field in units of 64-dword.
305-
return (64 * 4) * ((1 << 15) - 1);
304+
if (getGeneration() == GFX11) {
305+
// 15-bit field in units of 64-dword.
306+
return (64 * 4) * ((1 << 15) - 1);
307+
}
308+
// 13-bit field in units of 256-dword.
309+
return (256 * 4) * ((1 << 13) - 1);
306310
}
307311

308312
/// Return the number of high bits known to be zero for a frame index.

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,11 +1176,13 @@ enum Type { TRAP = -2, WORKGROUP = -1 };
11761176

11771177
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
11781178
#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
1179-
#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
1179+
#define S_00B860_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12)
1180+
#define S_00B860_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12)
11801181

11811182
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
11821183
#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
1183-
#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
1184+
#define S_0286E8_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12)
1185+
#define S_0286E8_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12)
11841186

11851187
#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
11861188
#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
Lines changed: 48 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,81 @@
1-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
2-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
3-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
4-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
5-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
68

7-
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14:
9+
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
810
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
9-
; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]]
10-
; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
11-
define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 {
11+
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
12+
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
13+
define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
1214
%alloca = alloca i32, align 4, addrspace(5)
1315
store volatile i32 0, ptr addrspace(5) %alloca
1416
%toint = ptrtoint ptr addrspace(5) %alloca to i32
15-
%masked = and i32 %toint, 16383
17+
%masked = and i32 %toint, 65535
1618
store volatile i32 %masked, ptr addrspace(1) undef
1719
ret void
1820
}
1921

20-
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
22+
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
2123
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
22-
; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]]
23-
; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
24-
define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
24+
; SCRATCH128K-NOT: v_and_b32
25+
; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
26+
; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
27+
; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
28+
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
29+
define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
2530
%alloca = alloca i32, align 4, addrspace(5)
2631
store volatile i32 0, ptr addrspace(5) %alloca
2732
%toint = ptrtoint ptr addrspace(5) %alloca to i32
28-
%masked = and i32 %toint, 65535
33+
%masked = and i32 %toint, 131071
2934
store volatile i32 %masked, ptr addrspace(1) undef
3035
ret void
3136
}
3237

33-
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
38+
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
3439
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
35-
; WAVE64-NOT: [[FI]]
36-
; WAVE64: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
37-
38-
; WAVE32: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1fffc, [[FI]]
39-
; WAVE32: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
40-
define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
40+
; SCRATCH128K-NOT: v_and_b32
41+
; SCRATCH256K-NOT: v_and_b32
42+
; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
43+
; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
44+
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
45+
define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
4146
%alloca = alloca i32, align 4, addrspace(5)
4247
store volatile i32 0, ptr addrspace(5) %alloca
4348
%toint = ptrtoint ptr addrspace(5) %alloca to i32
44-
%masked = and i32 %toint, 131071
49+
%masked = and i32 %toint, 262143
4550
store volatile i32 %masked, ptr addrspace(1) undef
4651
ret void
4752
}
4853

49-
; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18:
54+
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
5055
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
51-
; GCN-NOT: [[FI]]
52-
; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
53-
define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 {
56+
; SCRATCH128K-NOT: v_and_b32
57+
; SCRATCH256K-NOT: v_and_b32
58+
; SCRATCH1024K-NOT: v_and_b32
59+
; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0xffffc, [[FI]]
60+
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
61+
define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
5462
%alloca = alloca i32, align 4, addrspace(5)
5563
store volatile i32 0, ptr addrspace(5) %alloca
5664
%toint = ptrtoint ptr addrspace(5) %alloca to i32
57-
%masked = and i32 %toint, 262143
65+
%masked = and i32 %toint, 1048575
5866
store volatile i32 %masked, ptr addrspace(1) undef
5967
ret void
6068
}
6169

62-
attributes #0 = { nounwind }
70+
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
71+
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
72+
; GCN-NOT: v_and_b32
73+
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
74+
define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
75+
%alloca = alloca i32, align 4, addrspace(5)
76+
store volatile i32 0, ptr addrspace(5) %alloca
77+
%toint = ptrtoint ptr addrspace(5) %alloca to i32
78+
%masked = and i32 %toint, 2097151
79+
store volatile i32 %masked, ptr addrspace(1) undef
80+
ret void
81+
}

0 commit comments

Comments
 (0)