Skip to content

Commit 5a556d5

Browse files
authored
AMDGPU: Increase the LDS size to support to 160 KB for gfx950 (#116309)
1 parent cab7328 commit 5a556d5

File tree

10 files changed

+173
-9
lines changed

10 files changed

+173
-9
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5475,6 +5475,8 @@ The fields used by CP for code objects before V3 also match those specified in
54755475
roundup(lds-size / (64 * 4))
54765476
GFX7-GFX11
54775477
roundup(lds-size / (128 * 4))
5478+
GFX950
5479+
roundup(lds-size / (320 * 4))
54785480

54795481
24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
54805482
_INVALID_OPERATION with specified exceptions

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,7 +1192,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
11921192

11931193
def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
11941194
"gfx9",
1195-
[FeatureFP64, FeatureAddressableLocalMemorySize65536,
1195+
[FeatureFP64,
11961196
FeatureWavefrontSize64, FeatureFlatAddressSpace,
11971197
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
11981198
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
@@ -1358,6 +1358,7 @@ def FeatureISAVersion8_1_0 : FeatureSet<
13581358

13591359
def FeatureISAVersion9_0_Common : FeatureSet<
13601360
[FeatureGFX9,
1361+
FeatureAddressableLocalMemorySize65536,
13611362
FeatureLDSBankCount32,
13621363
FeatureImageInsts,
13631364
FeatureMadMacF32Insts]>;
@@ -1375,7 +1376,8 @@ def FeatureISAVersion9_Generic : FeatureSet<
13751376

13761377
def FeatureISAVersion9_0_MI_Common : FeatureSet<
13771378
!listconcat(FeatureISAVersion9_0_Common.Features,
1378-
[FeatureFmaMixInsts,
1379+
[FeatureAddressableLocalMemorySize65536,
1380+
FeatureFmaMixInsts,
13791381
FeatureDLInsts,
13801382
FeatureDot1Insts,
13811383
FeatureDot2Insts,
@@ -1491,15 +1493,17 @@ def FeatureISAVersion9_4_Common : FeatureSet<
14911493

14921494
def FeatureISAVersion9_5_Common : FeatureSet<
14931495
!listconcat(FeatureISAVersion9_4_Common.Features,
1494-
[FeatureFP8Insts,
1496+
[FeatureAddressableLocalMemorySize163840,
1497+
FeatureFP8Insts,
14951498
FeatureFP8ConversionInsts,
14961499
FeatureCvtFP8VOP1Bug,
1497-
FeatureGFX950Insts
1500+
FeatureGFX950Insts,
14981501
])>;
14991502

15001503
def FeatureISAVersion9_4_0 : FeatureSet<
15011504
!listconcat(FeatureISAVersion9_4_Common.Features,
15021505
[
1506+
FeatureAddressableLocalMemorySize65536,
15031507
FeatureForceStoreSC0SC1,
15041508
FeatureFP8Insts,
15051509
FeatureFP8ConversionInsts,
@@ -1510,6 +1514,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
15101514
def FeatureISAVersion9_4_1 : FeatureSet<
15111515
!listconcat(FeatureISAVersion9_4_Common.Features,
15121516
[
1517+
FeatureAddressableLocalMemorySize65536,
15131518
FeatureForceStoreSC0SC1,
15141519
FeatureFP8Insts,
15151520
FeatureFP8ConversionInsts,
@@ -1520,6 +1525,7 @@ def FeatureISAVersion9_4_1 : FeatureSet<
15201525
def FeatureISAVersion9_4_2 : FeatureSet<
15211526
!listconcat(FeatureISAVersion9_4_Common.Features,
15221527
[
1528+
FeatureAddressableLocalMemorySize65536,
15231529
FeatureFP8Insts,
15241530
FeatureFP8ConversionInsts,
15251531
FeatureCvtFP8VOP1Bug,
@@ -1528,7 +1534,8 @@ def FeatureISAVersion9_4_2 : FeatureSet<
15281534

15291535
def FeatureISAVersion9_4_Generic : FeatureSet<
15301536
!listconcat(FeatureISAVersion9_4_Common.Features,
1531-
[FeatureRequiresCOV6])>;
1537+
[FeatureAddressableLocalMemorySize65536,
1538+
FeatureRequiresCOV6])>;
15321539

15331540
def FeatureISAVersion9_5_0 : FeatureSet<FeatureISAVersion9_5_Common.Features>;
15341541

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,12 +1172,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
11721172
ProgInfo.DX10Clamp = Mode.DX10Clamp;
11731173

11741174
unsigned LDSAlignShift;
1175-
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
1176-
// LDS is allocated in 64 dword blocks.
1177-
LDSAlignShift = 8;
1178-
} else {
1175+
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
1176+
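// LDS is allocated in 320 dword blocks. NOTE(review): by analogy with the shifts below (9 -> 128 dwords, 8 -> 64 dwords), a shift of 11 gives 512-dword (2048-byte) blocks, not 320 -- confirm which granule is intended.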
1177+
LDSAlignShift = 11;
1178+
} else if (STM.getFeatureBits().test(
1179+
FeatureAddressableLocalMemorySize65536)) {
11791180
// LDS is allocated in 128 dword blocks.
11801181
LDSAlignShift = 9;
1182+
} else {
1183+
// LDS is allocated in 64 dword blocks.
1184+
LDSAlignShift = 8;
11811185
}
11821186

11831187
ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();

llvm/lib/Target/AMDGPU/AMDGPUFeatures.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
2929

3030
def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
3131
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
32+
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
3233

3334
class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
3435
"wavefrontsize"#!shl(1, ValueLog2),

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
916916
return 32768;
917917
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
918918
return 65536;
919+
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
920+
return 163840;
919921
return 0;
920922
}
921923

llvm/test/CodeGen/AMDGPU/extra-lds-size.ll

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-MESA %s
33
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL %s
44
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-MESA %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-PAL %s
6+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
57
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
68
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
79

@@ -17,6 +19,11 @@
1719
; GFX11-MESA: .long 45100
1820
; GFX11-MESA-NEXT: .long 1024
1921

22+
; GFX950-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
23+
24+
; GFX950-MESA: .long 45100
25+
; GFX950-MESA-NEXT: .long 512
26+
2027
; GFX1200-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400
2128

2229
; GFX1200-MESA: .long 45100
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT160K %s
2+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
3+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
4+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
5+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx941 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
6+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
7+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
8+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
9+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
10+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
11+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
12+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
13+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
14+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
15+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
16+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
17+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
18+
; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT32K %s
19+
20+
; gfx950 supports up to 160 KB LDS memory. The generic target does not.
21+
; This is a negative test to check when the LDS size exceeds the max usable limit.
22+
23+
; ERROR-LIMIT160K: error: <unknown>:0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit'
24+
; ERROR-LIMIT64K: error: <unknown>:0:0: local memory (163844) exceeds limit (65536) in function 'test_lds_limit'
25+
; ERROR-LIMIT32K: error: <unknown>:0:0: local memory (163844) exceeds limit (32768) in function 'test_lds_limit'
26+
@dst = addrspace(3) global [40961 x i32] poison
27+
28+
define amdgpu_kernel void @test_lds_limit(i32 %val) {
29+
%gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100
30+
store i32 %val, ptr addrspace(3) %gep
31+
ret void
32+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=MESA %s
3+
4+
; gfx950 supports up to 160 KB configurable LDS memory.
5+
; This test checks LDS allocations just above 128 KiB (131076 bytes) and at the 160 KiB maximum (163840 bytes).
6+
7+
@lds.i32 = addrspace(3) global i32 poison
8+
@lds.array.size.131076 = addrspace(3) global [32768 x i32] poison
9+
@lds.array.size.163840 = addrspace(3) global [40959 x i32] poison
10+
11+
; GCN-LABEL: test_lds_array_size_131076:
12+
; GCN: .amdhsa_group_segment_fixed_size 131076
13+
; GCN: ; LDSByteSize: 131076 bytes/workgroup
14+
; MESA: granulated_lds_size = 65
15+
define amdgpu_kernel void @test_lds_array_size_131076() {
16+
%gep = getelementptr inbounds [32768 x i32], ptr addrspace(3) @lds.array.size.131076, i32 0, i32 20
17+
%val = load i32, ptr addrspace(3) %gep
18+
store i32 %val, ptr addrspace(3) @lds.i32
19+
ret void
20+
}
21+
22+
; GCN-LABEL: test_lds_array_size_163840:
23+
; GCN: .amdhsa_group_segment_fixed_size 163840
24+
; GCN: ; LDSByteSize: 163840 bytes/workgroup
25+
; MESA: granulated_lds_size = 80
26+
define amdgpu_kernel void @test_lds_array_size_163840() {
27+
%gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.size.163840 , i32 0, i32 20
28+
%val = load i32, ptr addrspace(3) %gep
29+
store i32 %val, ptr addrspace(3) @lds.i32
30+
ret void
31+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=PAL %s
2+
3+
; GFX950 supports up to 160 KB configurable LDS memory.
4+
; This test checks the min and max size of LDS that can be allocated.
5+
6+
; PAL: .shader_functions:
7+
; PAL: test_lds_array_i32:
8+
; PAL: .lds_size: 0x28000
9+
; PAL: test_lds_i32:
10+
; PAL: .lds_size: 0x4
11+
12+
13+
@lds.i32 = addrspace(3) global i32 poison
14+
@lds.array.i32 = addrspace(3) global [40959 x i32] poison
15+
16+
define amdgpu_gfx void @test_lds_i32(i32 %val) {
17+
store i32 %val, ptr addrspace(3) @lds.i32
18+
ret void
19+
}
20+
21+
define amdgpu_gfx void @test_lds_array_i32() {
22+
%gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
23+
%val = load i32, ptr addrspace(3) %gep
24+
store i32 %val, ptr addrspace(3) @lds.i32
25+
ret void
26+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
;; Test disassembly for gfx950 kernel descriptor.
2+
3+
; RUN: rm -rf %t && split-file %s %t && cd %t
4+
5+
;--- 1.s
6+
; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1.s > 1.o
7+
; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s
8+
; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1-disasm.s > 1-disasm.o
9+
; FIXME: cmp 1.o 1-disasm.o
10+
; CHECK: .amdhsa_kernel kernel
11+
; CHECK-NEXT: .amdhsa_group_segment_fixed_size 163840
12+
; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0
13+
; CHECK-NEXT: .amdhsa_kernarg_size 0
14+
; CHECK-NEXT: .amdhsa_accum_offset 4
15+
; CHECK-NEXT: .amdhsa_tg_split 0
16+
; CHECK-NEXT: .amdhsa_next_free_vgpr 8
17+
; CHECK-NEXT: .amdhsa_reserve_vcc 0
18+
; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0
19+
; CHECK-NEXT: .amdhsa_next_free_sgpr 8
20+
; CHECK-NEXT: .amdhsa_float_round_mode_32 0
21+
; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0
22+
; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0
23+
; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3
24+
; CHECK-NEXT: .amdhsa_dx10_clamp 1
25+
; CHECK-NEXT: .amdhsa_ieee_mode 1
26+
; CHECK-NEXT: .amdhsa_fp16_overflow 0
27+
; CHECK-NEXT: .amdhsa_enable_private_segment 0
28+
; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
29+
; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
30+
; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
31+
; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0
32+
; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0
33+
; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
34+
; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0
35+
; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
36+
; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0
37+
; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0
38+
; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0
39+
; CHECK-NEXT: .amdhsa_exception_int_div_zero 0
40+
; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
41+
; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0
42+
; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
43+
; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0
44+
; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0
45+
; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0
46+
; CHECK-NEXT:.end_amdhsa_kernel
47+
.amdhsa_kernel kernel
48+
.amdhsa_group_segment_fixed_size 163840
49+
.amdhsa_next_free_vgpr 0
50+
.amdhsa_next_free_sgpr 0
51+
.amdhsa_accum_offset 4
52+
.end_amdhsa_kernel

0 commit comments

Comments
 (0)