Skip to content

[AMDGPU] Detect kills in register sets when trying to form V_CMPX instructions. #68293

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 33 additions & 8 deletions llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
Expand All @@ -32,6 +33,7 @@ class SIOptimizeExecMasking : public MachineFunctionPass {

DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
SmallVector<MachineOperand *, 1> KillFlagCandidates;

Register isCopyFromExec(const MachineInstr &MI) const;
Register isCopyToExec(const MachineInstr &MI) const;
Expand All @@ -41,15 +43,16 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
MachineBasicBlock::reverse_iterator
findExecCopy(MachineBasicBlock &MBB,
MachineBasicBlock::reverse_iterator I) const;

bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
MCRegister Reg, bool UseLiveOuts = false,
bool IgnoreStart = false) const;
bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
MachineInstr *findInstrBackwards(MachineInstr &Origin,
std::function<bool(MachineInstr *)> Pred,
ArrayRef<MCRegister> NonModifiableRegs,
unsigned MaxInstructions = 20) const;
MachineInstr *findInstrBackwards(
MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
ArrayRef<MCRegister> NonModifiableRegs,
MachineInstr *Terminator = nullptr,
SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
unsigned MaxInstructions = 20) const;
bool optimizeExecSequence();
void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
Expand Down Expand Up @@ -325,11 +328,13 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
// the beginning of the BB is reached or Pred evaluates to true - which can be
// an arbitrary condition based on the current MachineInstr, for instance a
// target instruction. Breaks prematurely by returning nullptr if one of the
// target instruction. Breaks prematurely by returning nullptr if one of the
// registers given in NonModifiableRegs is modified by the current instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
unsigned MaxInstructions) const {
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
E = Origin.getParent()->rend();
unsigned CurrentIteration = 0;
Expand All @@ -344,6 +349,21 @@ MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
for (MCRegister Reg : NonModifiableRegs) {
if (A->modifiesRegister(Reg, TRI))
return nullptr;

// Check for kills that appear after the terminator instruction on
// registers that overlap Reg without being Reg itself. Such kills would
// not be detected by clearKillFlags and would cause the register to be
// dead at a later place, which makes the verifier fail. The collected
// candidates are used to clear these kill flags later.
if (Terminator && KillFlagCandidates && A != Terminator &&
A->killsRegister(Reg, TRI)) {
for (MachineOperand &MO : A->operands()) {
if (MO.isReg() && MO.isKill()) {
Register Candidate = MO.getReg();
if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
KillFlagCandidates->push_back(&MO);
}
}
}
}

++CurrentIteration;
Expand Down Expand Up @@ -599,6 +619,9 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
if (Src1->isReg())
MRI->clearKillFlags(Src1->getReg());

for (MachineOperand *MO : KillFlagCandidates)
MO->setIsKill(false);

SaveExecInstr.eraseFromParent();
VCmp.eraseFromParent();

Expand Down Expand Up @@ -690,7 +713,8 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
NonDefRegs.push_back(Src1->getReg());

if (!findInstrBackwards(
MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
VCmp, &KillFlagCandidates))
return;

if (VCmp)
Expand Down Expand Up @@ -777,6 +801,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {

OrXors.clear();
SaveExecVCmpMapping.clear();
KillFlagCandidates.clear();
static unsigned SearchWindow = 10;
for (MachineBasicBlock &MBB : MF) {
unsigned SearchCount = 0;
Expand Down
39 changes: 39 additions & 0 deletions llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX1100 %s

# Input: between the V_CMP_EQ that defines $vcc_lo and the exec-masking
# sequence (COPY $exec_lo / S_AND_B32 / S_MOV_B32_term), $vgpr0 is marked
# killed on the V_CMP_NE and the overlapping $vgpr0_vgpr1 is marked killed on
# the buffer store.
# Expected: the sequence is rewritten to S_MOV_B32 + V_CMPX_EQ reading $vgpr0,
# and the earlier kill flags on $vgpr0 and $vgpr0_vgpr1 are no longer present.
---

name: vcmp_saveexec_to_vcmpx_set_kill
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F

; GFX1100-LABEL: name: vcmp_saveexec_to_vcmpx_set_kill
; GFX1100: liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: renamable $vgpr0 = V_AND_B32_e32 128, $vgpr90, implicit $exec
; GFX1100-NEXT: renamable $vgpr1 = V_AND_B32_e32 128, $vgpr89, implicit $exec
; GFX1100-NEXT: renamable $sgpr4 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec
; GFX1100-NEXT: renamable $sgpr0 = S_MOV_B32 0
; GFX1100-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
; GFX1100-NEXT: renamable $sgpr2 = COPY renamable $sgpr0
; GFX1100-NEXT: renamable $sgpr3 = COPY renamable $sgpr0
; GFX1100-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact $vgpr0_vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
; GFX1100-NEXT: renamable $sgpr68 = COPY renamable $sgpr66
; GFX1100-NEXT: $sgpr5 = S_MOV_B32 $exec_lo
; GFX1100-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, 0, implicit-def $exec, implicit $exec
renamable $vgpr0 = V_AND_B32_e32 128, $vgpr90, implicit $exec
renamable $vgpr1 = V_AND_B32_e32 128, $vgpr89, implicit $exec
renamable $vcc_lo = V_CMP_EQ_U32_e64 $vgpr0, 0, implicit $exec
renamable $sgpr4 = V_CMP_NE_U32_e64 0, killed $vgpr0, implicit $exec
renamable $sgpr0 = S_MOV_B32 0
renamable $sgpr1 = COPY renamable $sgpr0
renamable $sgpr2 = COPY renamable $sgpr0
renamable $sgpr3 = COPY renamable $sgpr0
BUFFER_STORE_DWORDX2_OFFSET_exact killed $vgpr0_vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
renamable $sgpr68 = COPY renamable $sgpr66
renamable $sgpr5 = COPY $exec_lo, implicit-def $exec_lo
renamable $sgpr6 = S_AND_B32 renamable $sgpr5, killed renamable $vcc_lo, implicit-def dead $scc
$exec_lo = S_MOV_B32_term killed renamable $sgpr6
...