Skip to content

Commit 18839ae

Browse files
[AMDGPU] Detect kills in register sets when trying to form V_CMPX instructions. (#68293)
During the SIOptimizeExecMasking pass, we try to form V_CMPX instructions by detecting S_AND_SAVEEXEC and V_MOV instructions. Generally, we require the input operand of the V_MOV, which becomes the input operand of the to-be-formed V_CMPX, to stay live. This is enforced by clearing the kill flags on that operand after the V_CMPX has been generated. However, if an intervening instruction kills a register set that contains said register (for example, `killed $vgpr0_vgpr1` when the operand is `$vgpr0`), clearKillFlags will not detect it. With this change, such additional kill-flag candidates are collected during the final call to findInstrBackwards, and their kill flags are then cleared so that all registers in the set stay live. Co-authored-by: Thomas Symalla <[email protected]>
1 parent 7ef6b21 commit 18839ae

File tree

2 files changed

+72
-8
lines changed

2 files changed

+72
-8
lines changed

llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "GCNSubtarget.h"
1111
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1212
#include "SIRegisterInfo.h"
13+
#include "llvm/ADT/SmallVector.h"
1314
#include "llvm/CodeGen/LivePhysRegs.h"
1415
#include "llvm/CodeGen/MachineFunctionPass.h"
1516
#include "llvm/CodeGen/MachineOperand.h"
@@ -32,6 +33,7 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
3233

3334
DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
3435
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
36+
SmallVector<MachineOperand *, 1> KillFlagCandidates;
3537

3638
Register isCopyFromExec(const MachineInstr &MI) const;
3739
Register isCopyToExec(const MachineInstr &MI) const;
@@ -41,15 +43,16 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
4143
MachineBasicBlock::reverse_iterator
4244
findExecCopy(MachineBasicBlock &MBB,
4345
MachineBasicBlock::reverse_iterator I) const;
44-
4546
bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
4647
MCRegister Reg, bool UseLiveOuts = false,
4748
bool IgnoreStart = false) const;
4849
bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
49-
MachineInstr *findInstrBackwards(MachineInstr &Origin,
50-
std::function<bool(MachineInstr *)> Pred,
51-
ArrayRef<MCRegister> NonModifiableRegs,
52-
unsigned MaxInstructions = 20) const;
50+
MachineInstr *findInstrBackwards(
51+
MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
52+
ArrayRef<MCRegister> NonModifiableRegs,
53+
MachineInstr *Terminator = nullptr,
54+
SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
55+
unsigned MaxInstructions = 20) const;
5356
bool optimizeExecSequence();
5457
void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
5558
bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
@@ -325,11 +328,13 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
325328
// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
326329
// the beginning of the BB is reached or Pred evaluates to true - which can be
327330
// an arbitrary condition based on the current MachineInstr, for instance an
328-
// target instruction. Breaks prematurely by returning nullptr if one of the
331+
// target instruction. Breaks prematurely by returning nullptr if one of the
329332
// registers given in NonModifiableRegs is modified by the current instruction.
330333
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
331334
MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
332-
ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
335+
ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
336+
SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
337+
unsigned MaxInstructions) const {
333338
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
334339
E = Origin.getParent()->rend();
335340
unsigned CurrentIteration = 0;
@@ -344,6 +349,21 @@ MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
344349
for (MCRegister Reg : NonModifiableRegs) {
345350
if (A->modifiesRegister(Reg, TRI))
346351
return nullptr;
352+
353+
// Check for kills that appear after the terminator instruction, that
354+
// would not be detected by clearKillFlags, since they will cause the
355+
// register to be dead at a later place, causing the verifier to fail.
356+
// We use the candidates to clear the kill flags later.
357+
if (Terminator && KillFlagCandidates && A != Terminator &&
358+
A->killsRegister(Reg, TRI)) {
359+
for (MachineOperand &MO : A->operands()) {
360+
if (MO.isReg() && MO.isKill()) {
361+
Register Candidate = MO.getReg();
362+
if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
363+
KillFlagCandidates->push_back(&MO);
364+
}
365+
}
366+
}
347367
}
348368

349369
++CurrentIteration;
@@ -599,6 +619,9 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
599619
if (Src1->isReg())
600620
MRI->clearKillFlags(Src1->getReg());
601621

622+
for (MachineOperand *MO : KillFlagCandidates)
623+
MO->setIsKill(false);
624+
602625
SaveExecInstr.eraseFromParent();
603626
VCmp.eraseFromParent();
604627

@@ -690,7 +713,8 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
690713
NonDefRegs.push_back(Src1->getReg());
691714

692715
if (!findInstrBackwards(
693-
MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
716+
MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
717+
VCmp, &KillFlagCandidates))
694718
return;
695719

696720
if (VCmp)
@@ -777,6 +801,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
777801

778802
OrXors.clear();
779803
SaveExecVCmpMapping.clear();
804+
KillFlagCandidates.clear();
780805
static unsigned SearchWindow = 10;
781806
for (MachineBasicBlock &MBB : MF) {
782807
unsigned SearchCount = 0;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX1100 %s
3+
4+
---
5+
6+
name: vcmp_saveexec_to_vcmpx_set_kill
7+
tracksRegLiveness: true
8+
body: |
9+
bb.0:
10+
liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F
11+
12+
; GFX1100-LABEL: name: vcmp_saveexec_to_vcmpx_set_kill
13+
; GFX1100: liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F
14+
; GFX1100-NEXT: {{ $}}
15+
; GFX1100-NEXT: renamable $vgpr0 = V_AND_B32_e32 128, $vgpr90, implicit $exec
16+
; GFX1100-NEXT: renamable $vgpr1 = V_AND_B32_e32 128, $vgpr89, implicit $exec
17+
; GFX1100-NEXT: renamable $sgpr4 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec
18+
; GFX1100-NEXT: renamable $sgpr0 = S_MOV_B32 0
19+
; GFX1100-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
20+
; GFX1100-NEXT: renamable $sgpr2 = COPY renamable $sgpr0
21+
; GFX1100-NEXT: renamable $sgpr3 = COPY renamable $sgpr0
22+
; GFX1100-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact $vgpr0_vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
23+
; GFX1100-NEXT: renamable $sgpr68 = COPY renamable $sgpr66
24+
; GFX1100-NEXT: $sgpr5 = S_MOV_B32 $exec_lo
25+
; GFX1100-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, 0, implicit-def $exec, implicit $exec
26+
renamable $vgpr0 = V_AND_B32_e32 128, $vgpr90, implicit $exec
27+
renamable $vgpr1 = V_AND_B32_e32 128, $vgpr89, implicit $exec
28+
renamable $vcc_lo = V_CMP_EQ_U32_e64 $vgpr0, 0, implicit $exec
29+
renamable $sgpr4 = V_CMP_NE_U32_e64 0, killed $vgpr0, implicit $exec
30+
renamable $sgpr0 = S_MOV_B32 0
31+
renamable $sgpr1 = COPY renamable $sgpr0
32+
renamable $sgpr2 = COPY renamable $sgpr0
33+
renamable $sgpr3 = COPY renamable $sgpr0
34+
BUFFER_STORE_DWORDX2_OFFSET_exact killed $vgpr0_vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
35+
renamable $sgpr68 = COPY renamable $sgpr66
36+
renamable $sgpr5 = COPY $exec_lo, implicit-def $exec_lo
37+
renamable $sgpr6 = S_AND_B32 renamable $sgpr5, killed renamable $vcc_lo, implicit-def dead $scc
38+
$exec_lo = S_MOV_B32_term killed renamable $sgpr6
39+
...

0 commit comments

Comments
 (0)