-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Increase max scratch allocation for GFX12 #77625
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) ChangesFull diff: https://github.com/llvm/llvm-project/pull/77625.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d317a733d4331c..10f7e7a26edb4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -981,8 +981,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX11
- ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ STM.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
+ : STM.getGeneration() == AMDGPUSubtarget::GFX11
+ ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
: S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
@@ -993,8 +995,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX11
- ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ STM.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
+ : STM.getGeneration() == AMDGPUSubtarget::GFX11
+ ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
: S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f6f37f5170a403..9840b00812b88b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -295,12 +295,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
unsigned getMaxWaveScratchSize() const {
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
- if (getGeneration() < GFX11) {
- // 13-bit field in units of 256-dword.
- return (256 * 4) * ((1 << 13) - 1);
+ if (getGeneration() >= GFX12) {
+ // 18-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 18) - 1);
}
- // 15-bit field in units of 64-dword.
- return (64 * 4) * ((1 << 15) - 1);
+ if (getGeneration() == GFX11) {
+ // 15-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 15) - 1);
+ }
+ // 13-bit field in units of 256-dword.
+ return (256 * 4) * ((1 << 13) - 1);
}
/// Return the number of high bits known to be zero for a frame index.
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index b291400a947c7a..d7c5a53717c552 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1172,11 +1172,13 @@ enum Type { TRAP = -2, WORKGROUP = -1 };
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
-#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
+#define S_00B860_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12)
+#define S_00B860_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12)
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
-#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
+#define S_0286E8_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12)
+#define S_0286E8_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12)
#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index 94295dc3af3136..5882043caa0bd1 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -1,62 +1,81 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 {
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
- %masked = and i32 %toint, 16383
+ %masked = and i32 %toint, 65535
store volatile i32 %masked, ptr addrspace(1) undef
ret void
}
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
- %masked = and i32 %toint, 65535
+ %masked = and i32 %toint, 131071
store volatile i32 %masked, ptr addrspace(1) undef
ret void
}
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; WAVE64-NOT: [[FI]]
-; WAVE64: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
-
-; WAVE32: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1fffc, [[FI]]
-; WAVE32: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K-NOT: v_and_b32
+; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
- %masked = and i32 %toint, 131071
+ %masked = and i32 %toint, 262143
store volatile i32 %masked, ptr addrspace(1) undef
ret void
}
-; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN-NOT: [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
-define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K-NOT: v_and_b32
+; SCRATCH1024K-NOT: v_and_b32
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0xffffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
%toint = ptrtoint ptr addrspace(5) %alloca to i32
- %masked = and i32 %toint, 262143
+ %masked = and i32 %toint, 1048575
store volatile i32 %masked, ptr addrspace(1) undef
ret void
}
-attributes #0 = { nounwind }
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN-NOT: v_and_b32
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
+ %alloca = alloca i32, align 4, addrspace(5)
+ store volatile i32 0, ptr addrspace(5) %alloca
+ %toint = ptrtoint ptr addrspace(5) %alloca to i32
+ %masked = and i32 %toint, 2097151
+ store volatile i32 %masked, ptr addrspace(1) undef
+ ret void
+}
|
|
||
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The masklo14 case got lost?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
masklo16 tests the same thing. We only need to test one "x" from each of:
x < 17
17 <= x < 18
18 <= x < 20
20 <= x < 21
21 <= x
No description provided.