Skip to content

Commit 28233b1

Browse files
[AMDGPU] New AMDGPUInsertSingleUseVDST pass (#72388)
Add support for emitting GFX11.5 s_singleuse_vdst instructions. This is a power saving feature whereby the compiler can annotate VALU instructions whose results are known to have only a single use, so the hardware can in some cases avoid writing the result back to VGPR RAM. To begin with the pass is disabled by default because of one missing feature: we need an exclusion list of opcodes that never qualify as single-use producers and/or consumers. A future patch will implement this and enable the pass by default. --------- Co-authored-by: Scott Egerton <[email protected]>
1 parent 17fcad7 commit 28233b1

File tree

5 files changed

+763
-0
lines changed

5 files changed

+763
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,9 @@ extern char &SIModeRegisterID;
335335
void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
336336
extern char &AMDGPUInsertDelayAluID;
337337

338+
void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &);
339+
extern char &AMDGPUInsertSingleUseVDSTID;
340+
338341
void initializeSIInsertHardClausesPass(PassRegistry &);
339342
extern char &SIInsertHardClausesID;
340343

llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file
10+
/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
11+
/// instructions that produce single-use VGPR values. If the value is forwarded
12+
/// to the consumer instruction prior to VGPR writeback, the hardware can
13+
/// then skip (kill) the VGPR write.
14+
//
15+
//===----------------------------------------------------------------------===//
16+
17+
#include "AMDGPU.h"
18+
#include "GCNSubtarget.h"
19+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20+
#include "SIInstrInfo.h"
21+
#include "llvm/ADT/DenseMap.h"
22+
#include "llvm/ADT/STLExtras.h"
23+
#include "llvm/ADT/StringRef.h"
24+
#include "llvm/CodeGen/MachineBasicBlock.h"
25+
#include "llvm/CodeGen/MachineFunction.h"
26+
#include "llvm/CodeGen/MachineFunctionPass.h"
27+
#include "llvm/CodeGen/MachineInstr.h"
28+
#include "llvm/CodeGen/MachineInstrBuilder.h"
29+
#include "llvm/CodeGen/MachineOperand.h"
30+
#include "llvm/CodeGen/Register.h"
31+
#include "llvm/CodeGen/TargetSubtargetInfo.h"
32+
#include "llvm/IR/DebugLoc.h"
33+
#include "llvm/MC/MCRegister.h"
34+
#include "llvm/Pass.h"
35+
36+
using namespace llvm;
37+
38+
#define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
39+
40+
namespace {
41+
class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
42+
private:
43+
const SIInstrInfo *SII;
44+
45+
public:
46+
static char ID;
47+
48+
AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
49+
50+
void emitSingleUseVDST(MachineInstr &MI) const {
51+
// Mark the following instruction as a single-use producer:
52+
// s_singleuse_vdst { supr0: 1 }
53+
BuildMI(*MI.getParent(), MI, DebugLoc(), SII->get(AMDGPU::S_SINGLEUSE_VDST))
54+
.addImm(0x1);
55+
}
56+
57+
bool runOnMachineFunction(MachineFunction &MF) override {
58+
const auto &ST = MF.getSubtarget<GCNSubtarget>();
59+
if (!ST.hasVGPRSingleUseHintInsts())
60+
return false;
61+
62+
SII = ST.getInstrInfo();
63+
const auto *TRI = &SII->getRegisterInfo();
64+
bool InstructionEmitted = false;
65+
66+
for (MachineBasicBlock &MBB : MF) {
67+
DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits
68+
69+
// Handle boundaries at the end of basic block separately to avoid
70+
// false positives. If they are live at the end of a basic block then
71+
// assume it has more uses later on.
72+
for (const auto &Liveouts : MBB.liveouts())
73+
RegisterUseCount[Liveouts.PhysReg] = 2;
74+
75+
for (MachineInstr &MI : reverse(MBB.instrs())) {
76+
// All registers in all operands need to be single use for an
77+
// instruction to be marked as a single use producer.
78+
bool AllProducerOperandsAreSingleUse = true;
79+
80+
for (const auto &Operand : MI.operands()) {
81+
if (!Operand.isReg())
82+
continue;
83+
const auto Reg = Operand.getReg();
84+
85+
// Count the number of times each register is read.
86+
if (Operand.readsReg())
87+
RegisterUseCount[Reg]++;
88+
89+
// Do not attempt to optimise across exec mask changes.
90+
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
91+
for (auto &UsedReg : RegisterUseCount)
92+
UsedReg.second = 2;
93+
}
94+
95+
// If we are at the point where the register first became live,
96+
// check if the operands are single use.
97+
if (!MI.modifiesRegister(Reg, TRI))
98+
continue;
99+
if (RegisterUseCount[Reg] > 1)
100+
AllProducerOperandsAreSingleUse = false;
101+
// Reset uses count when a register is no longer live.
102+
RegisterUseCount.erase(Reg);
103+
}
104+
if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
105+
// TODO: Replace with candidate logging for instruction grouping
106+
// later.
107+
emitSingleUseVDST(MI);
108+
InstructionEmitted = true;
109+
}
110+
}
111+
}
112+
return InstructionEmitted;
113+
}
114+
};
115+
} // namespace
116+
117+
// Storage for the pass's static ID member; the constructor hands it to
// MachineFunctionPass.
char AMDGPUInsertSingleUseVDST::ID = 0;
118+
119+
// Reference to the pass ID exported from the llvm namespace (declared in
// AMDGPU.h) so that code outside this file can name the pass, e.g.
// addPass(&AMDGPUInsertSingleUseVDSTID) in the target machine.
char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
120+
121+
// Defines initializeAMDGPUInsertSingleUseVDSTPass(), registering the pass
// under the DEBUG_TYPE argument string with the description below.
INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
122+
"AMDGPU Insert SingleUseVDST", false, false)

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
286286
cl::init(true),
287287
cl::Hidden);
288288

289+
// Enable GFX11.5+ s_singleuse_vdst insertion.
// Hidden option, off by default: the pass still lacks an exclusion list of
// opcodes that never qualify as single-use producers and/or consumers.
// Consulted through isPassEnabled() in GCNPassConfig::addPreEmitPass() to
// decide whether AMDGPUInsertSingleUseVDST is added to the pipeline.
290+
static cl::opt<bool>
291+
EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
292+
cl::desc("Enable s_singleuse_vdst insertion"),
293+
cl::init(false), cl::Hidden);
294+
289295
// Enable GFX11+ s_delay_alu insertion
290296
static cl::opt<bool>
291297
EnableInsertDelayAlu("amdgpu-enable-delay-alu",
@@ -404,6 +410,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
404410
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
405411
initializeAMDGPUUnifyMetadataPass(*PR);
406412
initializeSIAnnotateControlFlowPass(*PR);
413+
initializeAMDGPUInsertSingleUseVDSTPass(*PR);
407414
initializeAMDGPUInsertDelayAluPass(*PR);
408415
initializeSIInsertHardClausesPass(*PR);
409416
initializeSIInsertWaitcntsPass(*PR);
@@ -1448,6 +1455,9 @@ void GCNPassConfig::addPreEmitPass() {
14481455
// cases.
14491456
addPass(&PostRAHazardRecognizerID);
14501457

1458+
if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
1459+
addPass(&AMDGPUInsertSingleUseVDSTID);
1460+
14511461
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
14521462
addPass(&AMDGPUInsertDelayAluID);
14531463

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ add_llvm_target(AMDGPUCodeGen
7777
AMDGPUMacroFusion.cpp
7878
AMDGPUMCInstLower.cpp
7979
AMDGPUIGroupLP.cpp
80+
AMDGPUInsertSingleUseVDST.cpp
8081
AMDGPUMIRFormatter.cpp
8182
AMDGPUOpenCLEnqueuedBlockLowering.cpp
8283
AMDGPUPerfHintAnalysis.cpp

0 commit comments

Comments
 (0)