Skip to content

Commit 0feb4bc

Browse files
perlfu authored and tstellar committed
Fix missed SI_RETURN_TO_EPILOG in pre-emit peephole
SIPreEmitPeephole does not process all terminators, which means it can fail to handle SI_RETURN_TO_EPILOG if immediately preceded by a branch to the early exit block. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D85872 (cherry picked from commit d538c58)
1 parent b430f94 commit 0feb4bc

File tree

2 files changed

+85
-8
lines changed

2 files changed

+85
-8
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 16 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -254,16 +254,24 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
254254

255255
for (MachineBasicBlock &MBB : MF) {
256256
MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
257-
if (MBBE != MBB.end()) {
258-
MachineInstr &MI = *MBBE;
257+
MachineBasicBlock::iterator TermI = MBBE;
258+
// Check first terminator for VCC branches to optimize
259+
if (TermI != MBB.end()) {
260+
MachineInstr &MI = *TermI;
259261
switch (MI.getOpcode()) {
260262
case AMDGPU::S_CBRANCH_VCCZ:
261263
case AMDGPU::S_CBRANCH_VCCNZ:
262264
Changed |= optimizeVccBranch(MI);
263265
continue;
264-
case AMDGPU::SI_RETURN_TO_EPILOG:
265-
// FIXME: This is not an optimization and should be
266-
// moved somewhere else.
266+
default:
267+
break;
268+
}
269+
}
270+
// Check all terminators for SI_RETURN_TO_EPILOG
271+
// FIXME: This is not an optimization and should be moved somewhere else.
272+
while (TermI != MBB.end()) {
273+
MachineInstr &MI = *TermI;
274+
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
267275
assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
268276

269277
// Graphics shaders returning non-void shouldn't contain S_ENDPGM,
@@ -281,11 +289,11 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
281289
.addMBB(EmptyMBBAtEnd);
282290
MI.eraseFromParent();
283291
MBBE = MBB.getFirstTerminator();
292+
TermI = MBBE;
293+
continue;
284294
}
285-
break;
286-
default:
287-
break;
288295
}
296+
TermI++;
289297
}
290298

291299
if (!ST.hasVGPRIndexMode())

llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll

Lines changed: 69 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,4 +78,73 @@ else: ; preds = %else.if.cond
7878
unreachable
7979
}
8080

81+
define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 {
82+
; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill
83+
; GCN: bb.0.entry:
84+
; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000)
85+
; GCN: liveins: $vgpr0
86+
; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
87+
; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
88+
; GCN: $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
89+
; GCN: renamable $sgpr0_sgpr1 = S_XOR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
90+
; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
91+
; GCN: bb.1.flow.preheader:
92+
; GCN: successors: %bb.2(0x80000000)
93+
; GCN: liveins: $vgpr0, $sgpr0_sgpr1
94+
; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
95+
; GCN: renamable $sgpr2_sgpr3 = S_MOV_B64 0
96+
; GCN: bb.2.flow:
97+
; GCN: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
98+
; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3
99+
; GCN: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
100+
; GCN: renamable $sgpr2_sgpr3 = S_OR_B64 killed renamable $sgpr4_sgpr5, killed renamable $sgpr2_sgpr3, implicit-def $scc
101+
; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
102+
; GCN: S_CBRANCH_EXECNZ %bb.2, implicit $exec
103+
; GCN: bb.3.Flow:
104+
; GCN: successors: %bb.4(0x80000000)
105+
; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
106+
; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
107+
; GCN: bb.4.Flow1:
108+
; GCN: successors: %bb.5(0x40000000), %bb.6(0x40000000)
109+
; GCN: liveins: $sgpr0_sgpr1
110+
; GCN: renamable $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
111+
; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
112+
; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
113+
; GCN: bb.5.kill0:
114+
; GCN: successors: %bb.6(0x80000000)
115+
; GCN: liveins: $sgpr0_sgpr1
116+
; GCN: $exec = S_MOV_B64 0
117+
; GCN: bb.6.end:
118+
; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000)
119+
; GCN: liveins: $sgpr0_sgpr1
120+
; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
121+
; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec
122+
; GCN: S_BRANCH %bb.8
123+
; GCN: bb.7:
124+
; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
125+
; GCN: S_ENDPGM 0
126+
; GCN: bb.8:
127+
entry:
128+
%.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val
129+
%cmp0 = fcmp olt float %.i0, 0.000000e+00
130+
br i1 %cmp0, label %kill0, label %flow
131+
132+
kill0: ; preds = %entry
133+
call void @llvm.amdgcn.kill(i1 false)
134+
br label %end
135+
136+
flow: ; preds = %entry
137+
%cmp1 = fcmp olt float %val, 0.000000e+00
138+
br i1 %cmp1, label %flow, label %end
139+
140+
kill1: ; preds = %flow
141+
call void @llvm.amdgcn.kill(i1 false)
142+
br label %end
143+
144+
end: ; preds = %kill0, %kill1, %flow
145+
ret { <4 x float> } undef
146+
}
147+
148+
declare void @llvm.amdgcn.kill(i1) #0
149+
81150
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)