Commit 22e3dc6

AMDGPU: Fix not using s33 for scratch wave offset in kernels

Fixes missing piece from r363990.

llvm-svn: 364099

1 parent 7f9c9f2
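
For readers of the diff below: with this change, a kernel that has a frame and makes calls pins both its scratch wave offset and frame offset to s33 (per the commit message, completing the convention from r363990), while call-free kernels keep using the target's reserved offset register. Here is a minimal standalone sketch of that decision; pickScratchWaveOffsetReg and reservedWaveByteOffsetReg are hypothetical stand-ins for the real logic in reservePrivateMemoryRegs and TRI.reservedPrivateSegmentWaveByteOffsetReg, not LLVM APIs.

#include <cstdio>

// Hypothetical stand-in for TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
// the real query picks a reserved SGPR based on the machine function.
static unsigned reservedWaveByteOffsetReg() { return 34; }

// Sketch of the post-commit decision: a kernel that makes calls uses s33
// for the scratch wave offset, presumably because callees expect it there.
static unsigned pickScratchWaveOffsetReg(bool hasFP, bool hasCalls) {
  if (!hasFP)
    return 0; // no frame: no dedicated offset register is chosen here
  return hasCalls ? 33u : reservedWaveByteOffsetReg();
}

int main() {
  std::printf("kernel with calls -> s%u\n", pickScratchWaveOffsetReg(true, true));
  std::printf("leaf kernel       -> s%u\n", pickScratchWaveOffsetReg(true, false));
}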

File tree

4 files changed: +22 -16 lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
llvm/test/CodeGen/AMDGPU/ipra.ll
llvm/test/CodeGen/AMDGPU/stack-realign.ll

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 11 additions & 7 deletions
@@ -1829,11 +1829,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
     Info.setScratchRSrcReg(ReservedBufferReg);
   }
 
-  // This should be accurate for kernels even before the frame is finalized.
-  const bool HasFP = ST.getFrameLowering()->hasFP(MF);
-  if (HasFP) {
-    unsigned ReservedOffsetReg =
-        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+  // hasFP should be accurate for kernels even before the frame is finalized.
+  if (ST.getFrameLowering()->hasFP(MF)) {
     MachineRegisterInfo &MRI = MF.getRegInfo();
 
     // Try to use s32 as the SP, but move it if it would interfere with input
@@ -1860,8 +1857,15 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
       report_fatal_error("failed to find register for SP");
     }
 
-    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-    Info.setFrameOffsetReg(ReservedOffsetReg);
+    if (MFI.hasCalls()) {
+      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
+      Info.setFrameOffsetReg(AMDGPU::SGPR33);
+    } else {
+      unsigned ReservedOffsetReg =
+          TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+      Info.setFrameOffsetReg(ReservedOffsetReg);
+    }
   } else if (RequiresStackAccess) {
     assert(!MFI.hasCalls());
     // We know there are accesses and they will be done relative to SP, so just

llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll

Lines changed: 4 additions & 5 deletions
@@ -104,9 +104,9 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)
 }
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
-; GCN: s_mov_b32 s33, s31
+; GCN: s_mov_b32 s34, s31
 ; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s31, s33
+; GCN-NEXT: s_mov_b32 s31, s34
 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
   %s31 = call i32 asm sideeffect "; def $0", "={s31}"()
   call void @external_void_func_void()
@@ -128,15 +128,14 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
 ; FIXME: What is the expected behavior for reserved registers here?
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; GCN: s_mov_b32 s34, s9
-; GCN-NOT: s33
+; GCN: s_mov_b32 s33, s9
+; GCN: s_mov_b32 s32, s33
 ; GCN: #ASMSTART
 ; GCN-NEXT: ; def s33
 ; GCN-NEXT: #ASMEND
 ; GCN: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4
-; GCN: s_mov_b32 s32, s34
 ; GCN: s_swappc_b64 s[30:31], s[4:5]
 ; GCN: ;;#ASMSTART
 ; GCN-NEXT: ; use s33

llvm/test/CodeGen/AMDGPU/ipra.ll

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
 ; GCN-NOT: writelane
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; NumSgprs: 37
+; GCN: ; NumSgprs: 38
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
   %vgpr = load volatile i32, i32 addrspace(1)* undef

llvm/test/CodeGen/AMDGPU/stack-realign.ll

Lines changed: 6 additions & 3 deletions
@@ -91,7 +91,8 @@ define void @force_realign4(i32 %idx) #1 {
 }
 
 ; GCN-LABEL: {{^}}kernel_call_align16_from_8:
-; GCN: s_add_u32 s32, s8, 0x400{{$}}
+; GCN: s_mov_b32 s33, s7{{$}}
+; GCN-NEXT: s_add_u32 s32, s33, 0x400{{$}}
 ; GCN-NOT: s32
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
@@ -103,7 +104,8 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
 
 ; The call sequence should keep the stack on call aligned to 4
 ; GCN-LABEL: {{^}}kernel_call_align16_from_5:
-; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_mov_b32 s33, s7{{$}}
+; GCN-NEXT: s_add_u32 s32, s33, 0x400
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align16_from_5() {
   %alloca0 = alloca i8, align 1, addrspace(5)
@@ -114,7 +116,8 @@ define amdgpu_kernel void @kernel_call_align16_from_5() {
 }
 
 ; GCN-LABEL: {{^}}kernel_call_align4_from_5:
-; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_mov_b32 s33, s7{{$}}
+; GCN: s_add_u32 s32, s33, 0x400
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align4_from_5() {
   %alloca0 = alloca i8, align 1, addrspace(5)
