Skip to content

Commit d09d4f1

Browse files
committed
Add low latency check
Change-Id: Iec36f11060ca1b46b6c33130d4ee02863360c671
1 parent 7e3caf5 commit d09d4f1

File tree

5 files changed

+129
-6
lines changed

5 files changed

+129
-6
lines changed

llvm/include/llvm/CodeGen/TargetInstrInfo.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1806,8 +1806,13 @@ class TargetInstrInfo : public MCInstrInfo {
18061806
unsigned defaultDefLatency(const MCSchedModel &SchedModel,
18071807
const MachineInstr &DefMI) const;
18081808

1809+
/// Return true if the target considers \p MI to be a low latency instruction.
/// The default implementation returns false for every instruction, so a target
/// has to override this hook to report any instruction as low latency.
1810+
virtual bool isLowLatencyInstruction(const MachineInstr &MI) const {
1811+
return false;
1812+
}
1813+
18091814
/// Return true if this opcode has high latency to its result.
1810-
virtual bool isHighLatencyDef(int opc) const { return false; }
1815+
virtual bool isHighLatencyDef(int opc) const { return false; }
18111816

18121817
/// Compute operand latency between a def of 'Reg'
18131818
/// and a use in the current loop. Return true if the target considered

llvm/lib/CodeGen/MachineSink.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1652,13 +1652,18 @@ bool MachineSinking::aggressivelySinkIntoCycle(
16521652
MachineCycle *Cycle, MachineInstr &I,
16531653
DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
16541654
&SunkInstrs) {
1655+
// TODO: support instructions with multiple defs
1656+
if (I.getNumDefs() > 1)
1657+
return false;
1658+
1659+
// Only sink instructions which the target considers to be low latency
1660+
if (!TII->isLowLatencyInstruction(I))
1661+
return false;
1662+
16551663
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
16561664
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
16571665
assert(Preheader && "Cycle sink needs a preheader block");
16581666
SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
1659-
// TODO: support instructions with multiple defs
1660-
if (I.getNumDefs() > 1)
1661-
return false;
16621667

16631668
MachineOperand &DefMO = I.getOperand(0);
16641669
for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8676,7 +8676,13 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
86768676
/// Return true if \p MI is treated as a low latency instruction: a copy, an
/// SMRD opcode, or (when an instruction scheduling model is available) an
/// opcode whose computed latency is below 4.
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
86778677
unsigned Opc = MI.getOpcode();
86788678

8679-
return isSMRD(Opc);
8679+
// Copies and SMRD opcodes are always reported as low latency.
if (MI.isCopy() || isSMRD(Opc))
8680+
return true;
8681+
8682+
// Otherwise defer to the scheduling model when it has per-instruction data.
// The cutoff of 4 is hard-coded here.
// NOTE(review): presumably the latency is measured in cycles -- confirm.
if (SchedModel.hasInstrSchedModel())
8683+
return SchedModel.computeInstrLatency(Opc) < 4;
8684+
8685+
// No instruction scheduling model: report the instruction as not low latency.
return false;
86808686
}
86818687

86828688
bool SIInstrInfo::isHighLatencyDef(int Opc) const {

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1291,7 +1291,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
12911291
uint64_t getDefaultRsrcDataFormat() const;
12921292
uint64_t getScratchRsrcWords23() const;
12931293

1294-
bool isLowLatencyInstruction(const MachineInstr &MI) const;
1294+
bool isLowLatencyInstruction(const MachineInstr &MI) const override;
12951295
bool isHighLatencyDef(int Opc) const override;
12961296

12971297
/// Return the descriptor of the target-specific machine instruction
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
3+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
4+
5+
---
6+
name: latency_cycle_sink
7+
tracksRegLiveness: true
8+
body: |
9+
; GFX10-SUNK-LABEL: name: latency_cycle_sink
10+
; GFX10-SUNK: bb.0:
11+
; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
12+
; GFX10-SUNK-NEXT: liveins: $vgpr4, $vgpr5
13+
; GFX10-SUNK-NEXT: {{ $}}
14+
; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
15+
; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
16+
; GFX10-SUNK-NEXT: S_BRANCH %bb.1
17+
; GFX10-SUNK-NEXT: {{ $}}
18+
; GFX10-SUNK-NEXT: bb.1:
19+
; GFX10-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
20+
; GFX10-SUNK-NEXT: {{ $}}
21+
; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
22+
; GFX10-SUNK-NEXT: S_BRANCH %bb.2
23+
; GFX10-SUNK-NEXT: {{ $}}
24+
; GFX10-SUNK-NEXT: bb.2:
25+
; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
26+
; GFX10-SUNK-NEXT: {{ $}}
27+
; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
28+
; GFX10-SUNK-NEXT: S_BRANCH %bb.4
29+
; GFX10-SUNK-NEXT: {{ $}}
30+
; GFX10-SUNK-NEXT: bb.3:
31+
; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
32+
; GFX10-SUNK-NEXT: {{ $}}
33+
; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
34+
; GFX10-SUNK-NEXT: S_BRANCH %bb.4
35+
; GFX10-SUNK-NEXT: {{ $}}
36+
; GFX10-SUNK-NEXT: bb.4:
37+
; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
38+
; GFX10-SUNK-NEXT: {{ $}}
39+
; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
40+
; GFX10-SUNK-NEXT: S_BRANCH %bb.5
41+
; GFX10-SUNK-NEXT: {{ $}}
42+
; GFX10-SUNK-NEXT: bb.5:
43+
; GFX10-SUNK-NEXT: S_ENDPGM 0
44+
;
45+
; GFX9-SUNK-LABEL: name: latency_cycle_sink
46+
; GFX9-SUNK: bb.0:
47+
; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
48+
; GFX9-SUNK-NEXT: liveins: $vgpr4, $vgpr5
49+
; GFX9-SUNK-NEXT: {{ $}}
50+
; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
51+
; GFX9-SUNK-NEXT: S_BRANCH %bb.1
52+
; GFX9-SUNK-NEXT: {{ $}}
53+
; GFX9-SUNK-NEXT: bb.1:
54+
; GFX9-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
55+
; GFX9-SUNK-NEXT: {{ $}}
56+
; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
57+
; GFX9-SUNK-NEXT: S_BRANCH %bb.2
58+
; GFX9-SUNK-NEXT: {{ $}}
59+
; GFX9-SUNK-NEXT: bb.2:
60+
; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
61+
; GFX9-SUNK-NEXT: {{ $}}
62+
; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
63+
; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
64+
; GFX9-SUNK-NEXT: S_BRANCH %bb.4
65+
; GFX9-SUNK-NEXT: {{ $}}
66+
; GFX9-SUNK-NEXT: bb.3:
67+
; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
68+
; GFX9-SUNK-NEXT: {{ $}}
69+
; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
70+
; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_2]], 8, [[V_PK_MUL_LO_U16_2]], 0, 0, 0, 0, 0, implicit $exec
71+
; GFX9-SUNK-NEXT: S_BRANCH %bb.4
72+
; GFX9-SUNK-NEXT: {{ $}}
73+
; GFX9-SUNK-NEXT: bb.4:
74+
; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
75+
; GFX9-SUNK-NEXT: {{ $}}
76+
; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
77+
; GFX9-SUNK-NEXT: S_BRANCH %bb.5
78+
; GFX9-SUNK-NEXT: {{ $}}
79+
; GFX9-SUNK-NEXT: bb.5:
80+
; GFX9-SUNK-NEXT: S_ENDPGM 0
81+
bb.0:
82+
successors: %bb.1(0x80000000)
83+
liveins: $vgpr4, $vgpr5
84+
%83:vgpr_32 = IMPLICIT_DEF
85+
%80:vgpr_32 = V_PK_MUL_LO_U16 8, %83, 8, %83, 0, 0, 0, 0, 0, implicit $exec
86+
S_BRANCH %bb.1
87+
88+
bb.1:
89+
S_CBRANCH_SCC1 %bb.3, implicit undef $scc
90+
S_BRANCH %bb.2
91+
92+
93+
bb.2:
94+
%90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
95+
S_BRANCH %bb.4
96+
97+
bb.3:
98+
%90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
99+
S_BRANCH %bb.4
100+
101+
bb.4:
102+
S_CBRANCH_SCC1 %bb.1, implicit undef $scc
103+
S_BRANCH %bb.5
104+
105+
bb.5:
106+
S_ENDPGM 0
107+
...

0 commit comments

Comments
 (0)