[AMDGPU] Increase max scratch allocation for GFX12 #77625

jayfoad · 2024-01-10T17:24:22Z

No description provided.

llvmbot · 2024-01-10T17:24:51Z

@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/77625.diff

4 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+8-4)
(modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+9-5)
(modified) llvm/lib/Target/AMDGPU/SIDefines.h (+4-2)
(modified) llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll (+48-29)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d317a733d4331c..10f7e7a26edb4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -981,8 +981,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
 
     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
     OutStreamer->emitInt32(
-        STM.getGeneration() >= AMDGPUSubtarget::GFX11
-            ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+        STM.getGeneration() >= AMDGPUSubtarget::GFX12
+            ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
+        : STM.getGeneration() == AMDGPUSubtarget::GFX11
+            ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
             : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
 
     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
@@ -993,8 +995,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
     OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
     OutStreamer->emitInt32(
-        STM.getGeneration() >= AMDGPUSubtarget::GFX11
-            ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+        STM.getGeneration() >= AMDGPUSubtarget::GFX12
+            ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
+        : STM.getGeneration() == AMDGPUSubtarget::GFX11
+            ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
             : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
   }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f6f37f5170a403..9840b00812b88b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -295,12 +295,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   unsigned getMaxWaveScratchSize() const {
     // See COMPUTE_TMPRING_SIZE.WAVESIZE.
-    if (getGeneration() < GFX11) {
-      // 13-bit field in units of 256-dword.
-      return (256 * 4) * ((1 << 13) - 1);
+    if (getGeneration() >= GFX12) {
+      // 18-bit field in units of 64-dword.
+      return (64 * 4) * ((1 << 18) - 1);
     }
-    // 15-bit field in units of 64-dword.
-    return (64 * 4) * ((1 << 15) - 1);
+    if (getGeneration() == GFX11) {
+      // 15-bit field in units of 64-dword.
+      return (64 * 4) * ((1 << 15) - 1);
+    }
+    // 13-bit field in units of 256-dword.
+    return (256 * 4) * ((1 << 13) - 1);
   }
 
   /// Return the number of high bits known to be zero for a frame index.
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index b291400a947c7a..d7c5a53717c552 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1172,11 +1172,13 @@ enum Type { TRAP = -2, WORKGROUP = -1 };
 
 #define R_00B860_COMPUTE_TMPRING_SIZE                                   0x00B860
 #define   S_00B860_WAVESIZE_PreGFX11(x)                               (((x) & 0x1FFF) << 12)
-#define   S_00B860_WAVESIZE_GFX11Plus(x)                              (((x) & 0x7FFF) << 12)
+#define   S_00B860_WAVESIZE_GFX11(x)                                  (((x) & 0x7FFF) << 12)
+#define   S_00B860_WAVESIZE_GFX12Plus(x)                              (((x) & 0x3FFFF) << 12)
 
 #define R_0286E8_SPI_TMPRING_SIZE                                       0x0286E8
 #define   S_0286E8_WAVESIZE_PreGFX11(x)                               (((x) & 0x1FFF) << 12)
-#define   S_0286E8_WAVESIZE_GFX11Plus(x)                              (((x) & 0x7FFF) << 12)
+#define   S_0286E8_WAVESIZE_GFX11(x)                                  (((x) & 0x7FFF) << 12)
+#define   S_0286E8_WAVESIZE_GFX12Plus(x)                              (((x) & 0x3FFFF) << 12)
 
 #define R_028B54_VGT_SHADER_STAGES_EN                                 0x028B54
 #define   S_028B54_HS_W32_EN(x)                                       (((x) & 0x1) << 21)
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index 94295dc3af3136..5882043caa0bd1 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -1,62 +1,81 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
 
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 {
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
-  %masked = and i32 %toint, 16383
+  %masked = and i32 %toint, 65535
   store volatile i32 %masked, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
-  %masked = and i32 %toint, 65535
+  %masked = and i32 %toint, 131071
   store volatile i32 %masked, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; WAVE64-NOT: [[FI]]
-; WAVE64: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
-
-; WAVE32: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1fffc, [[FI]]
-; WAVE32: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K-NOT: v_and_b32
+; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
-  %masked = and i32 %toint, 131071
+  %masked = and i32 %toint, 262143
   store volatile i32 %masked, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18:
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN-NOT: [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
-define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 {
+; SCRATCH128K-NOT: v_and_b32
+; SCRATCH256K-NOT: v_and_b32
+; SCRATCH1024K-NOT: v_and_b32
+; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0xffffc, [[FI]]
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   %toint = ptrtoint ptr addrspace(5) %alloca to i32
-  %masked = and i32 %toint, 262143
+  %masked = and i32 %toint, 1048575
   store volatile i32 %masked, ptr addrspace(1) undef
   ret void
 }
 
-attributes #0 = { nounwind }
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN-NOT: v_and_b32
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
+  %alloca = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %alloca
+  %toint = ptrtoint ptr addrspace(5) %alloca to i32
+  %masked = and i32 %toint, 2097151
+  store volatile i32 %masked, ptr addrspace(1) undef
+  ret void
+}

arsenm · 2024-01-11T03:49:36Z

llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll


-; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14:


The masklo14 case got lost?

masklo16 tests the same thing. We only need to test one "x" from each of:
x < 17
17 <= x < 18
18 <= x < 20
20 <= x < 21
21 <= x

[AMDGPU] Increase max scratch allocation for GFX12

6ba563a

llvmbot added the backend:AMDGPU label Jan 10, 2024

arsenm reviewed Jan 11, 2024

View reviewed changes

arsenm approved these changes Jan 17, 2024

View reviewed changes

jayfoad merged commit 42b9ea8 into llvm:main Jan 17, 2024

jayfoad deleted the gfx12-max-scratch branch January 17, 2024 10:25

ampandey-1995 pushed a commit to ampandey-1995/llvm-project that referenced this pull request Jan 19, 2024

[AMDGPU] Increase max scratch allocation for GFX12 (llvm#77625)

3c1391d

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] Increase max scratch allocation for GFX12 #77625

[AMDGPU] Increase max scratch allocation for GFX12 #77625

Uh oh!

jayfoad commented Jan 10, 2024

Uh oh!

llvmbot commented Jan 10, 2024

Uh oh!

arsenm Jan 11, 2024

Uh oh!

jayfoad Jan 11, 2024

Uh oh!

Uh oh!

[AMDGPU] Increase max scratch allocation for GFX12 #77625

[AMDGPU] Increase max scratch allocation for GFX12 #77625

Uh oh!

Conversation

jayfoad commented Jan 10, 2024

Uh oh!

llvmbot commented Jan 10, 2024

Uh oh!

arsenm Jan 11, 2024

Choose a reason for hiding this comment

Uh oh!

jayfoad Jan 11, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!