Skip to content

Commit 1a54671

Browse files
committed
[MachineScheduler] Track physical register dependencies per-regunit
Change the scheduler's physical register dependency tracking from registers-and-their-aliases to regunits. This has a couple of advantages when subregisters are used:

- The dependency tracking is more accurate and creates fewer useless edges in the dependency graph. An AMDGPU example, edited for clarity:

      SU(0): $vgpr1 = V_MOV_B32 $sgpr0
      SU(1): $vgpr1 = V_ADDC_U32 0, $vgpr1
      SU(2): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0

  There is a data dependency on $vgpr1 from SU(0) to SU(1) and from SU(1) to SU(2). But the old dependency tracking code also added a useless edge from SU(0) to SU(2) because it thought that SU(0)'s def of $vgpr1 aliased with SU(2)'s use of $vgpr0_vgpr1.

- On targets like AMDGPU that make heavy use of subregisters, each register can have a huge number of aliases - it can be quadratic in the size of the largest defined register tuple. The number of regunits per register has a much smaller upper bound, so iterating over regunits is faster than iterating over aliases.

The LLVM compile-time tracker shows a tiny overall improvement of 0.03% on X86. I expect a larger compile-time improvement on targets like AMDGPU.

Differential Revision: https://reviews.llvm.org/D156552
1 parent 5a64c89 commit 1a54671

File tree

12 files changed

+190
-190
lines changed

12 files changed

+190
-190
lines changed

llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,12 @@ namespace llvm {
7777
struct PhysRegSUOper {
7878
SUnit *SU;
7979
int OpIdx;
80-
unsigned Reg;
80+
unsigned RegUnit;
8181

82-
PhysRegSUOper(SUnit *su, int op, unsigned R): SU(su), OpIdx(op), Reg(R) {}
82+
PhysRegSUOper(SUnit *su, int op, unsigned R)
83+
: SU(su), OpIdx(op), RegUnit(R) {}
8384

84-
unsigned getSparseSetIndex() const { return Reg; }
85+
unsigned getSparseSetIndex() const { return RegUnit; }
8586
};
8687

8788
/// Use a SparseMultiSet to track physical registers. Storage is only

llvm/lib/CodeGen/ScheduleDAGInstrs.cpp

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,8 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
211211
for (const MachineOperand &MO : ExitMI->all_uses()) {
212212
Register Reg = MO.getReg();
213213
if (Reg.isPhysical()) {
214-
Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
214+
for (MCRegUnit Unit : TRI->regunits(Reg))
215+
Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
215216
} else if (Reg.isVirtual() && MO.readsReg()) {
216217
addVRegUseDeps(&ExitSU, MO.getOperandNo());
217218
}
@@ -222,8 +223,11 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
222223
// uses all the registers that are livein to the successor blocks.
223224
for (const MachineBasicBlock *Succ : BB->successors()) {
224225
for (const auto &LI : Succ->liveins()) {
225-
if (!Uses.contains(LI.PhysReg))
226-
Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
226+
// TODO: Use LI.LaneMask to refine this.
227+
for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) {
228+
if (!Uses.contains(Unit))
229+
Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
230+
}
227231
}
228232
}
229233
}
@@ -244,8 +248,8 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
244248
const MCInstrDesc &DefMIDesc = SU->getInstr()->getDesc();
245249
bool ImplicitPseudoDef = (OperIdx >= DefMIDesc.getNumOperands() &&
246250
!DefMIDesc.hasImplicitDefOfPhysReg(Reg));
247-
for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
248-
for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
251+
for (MCRegUnit Unit : TRI->regunits(Reg)) {
252+
for (Reg2SUnitsMap::iterator I = Uses.find(Unit); I != Uses.end(); ++I) {
249253
SUnit *UseSU = I->SU;
250254
if (UseSU == SU)
251255
continue;
@@ -262,11 +266,14 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
262266
// Set the hasPhysRegDefs only for physreg defs that have a use within
263267
// the scheduling region.
264268
SU->hasPhysRegDefs = true;
269+
265270
UseInstr = UseSU->getInstr();
271+
Register UseReg = UseInstr->getOperand(UseOpIdx).getReg();
266272
const MCInstrDesc &UseMIDesc = UseInstr->getDesc();
267-
ImplicitPseudoUse = (UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
268-
!UseMIDesc.hasImplicitUseOfPhysReg(*Alias));
269-
Dep = SDep(SU, SDep::Data, *Alias);
273+
ImplicitPseudoUse = UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
274+
!UseMIDesc.hasImplicitUseOfPhysReg(UseReg);
275+
276+
Dep = SDep(SU, SDep::Data, UseReg);
270277
}
271278
if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
272279
Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
@@ -300,15 +307,16 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
300307
// TODO: Using a latency of 1 here for output dependencies assumes
301308
// there's no cost for reusing registers.
302309
SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
303-
for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
304-
for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
310+
for (MCRegUnit Unit : TRI->regunits(Reg)) {
311+
for (Reg2SUnitsMap::iterator I = Defs.find(Unit); I != Defs.end(); ++I) {
305312
SUnit *DefSU = I->SU;
306313
if (DefSU == &ExitSU)
307314
continue;
308315
MachineInstr *DefInstr = DefSU->getInstr();
309-
if (DefSU != SU && (Kind != SDep::Output || !MO.isDead() ||
310-
!DefInstr->registerDefIsDead(*Alias))) {
311-
SDep Dep(SU, Kind, /*Reg=*/*Alias);
316+
MachineOperand &DefMO = DefInstr->getOperand(I->OpIdx);
317+
if (DefSU != SU &&
318+
(Kind != SDep::Output || !MO.isDead() || !DefMO.isDead())) {
319+
SDep Dep(SU, Kind, DefMO.getReg());
312320
if (Kind != SDep::Anti) {
313321
Dep.setLatency(
314322
SchedModel.computeOutputLatency(MI, OperIdx, DefInstr));
@@ -324,37 +332,42 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
324332
// Either insert a new Reg2SUnits entry with an empty SUnits list, or
325333
// retrieve the existing SUnits list for this register's uses.
326334
// Push this SUnit on the use list.
327-
Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
335+
for (MCRegUnit Unit : TRI->regunits(Reg))
336+
Uses.insert(PhysRegSUOper(SU, OperIdx, Unit));
328337
if (RemoveKillFlags)
329338
MO.setIsKill(false);
330339
} else {
331340
addPhysRegDataDeps(SU, OperIdx);
332341

333342
// Clear previous uses and defs of this register and its subregisters.
334-
for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) {
335-
Uses.eraseAll(SubReg);
343+
for (MCRegUnit Unit : TRI->regunits(Reg)) {
344+
Uses.eraseAll(Unit);
336345
if (!MO.isDead())
337-
Defs.eraseAll(SubReg);
346+
Defs.eraseAll(Unit);
338347
}
348+
339349
if (MO.isDead() && SU->isCall) {
340350
// Calls will not be reordered because of chain dependencies (see
341351
// below). Since call operands are dead, calls may continue to be added
342352
// to the DefList making dependence checking quadratic in the size of
343353
// the block. Instead, we leave only one call at the back of the
344354
// DefList.
345-
Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
346-
Reg2SUnitsMap::iterator B = P.first;
347-
Reg2SUnitsMap::iterator I = P.second;
348-
for (bool isBegin = I == B; !isBegin; /* empty */) {
349-
isBegin = (--I) == B;
350-
if (!I->SU->isCall)
351-
break;
352-
I = Defs.erase(I);
355+
for (MCRegUnit Unit : TRI->regunits(Reg)) {
356+
Reg2SUnitsMap::RangePair P = Defs.equal_range(Unit);
357+
Reg2SUnitsMap::iterator B = P.first;
358+
Reg2SUnitsMap::iterator I = P.second;
359+
for (bool isBegin = I == B; !isBegin; /* empty */) {
360+
isBegin = (--I) == B;
361+
if (!I->SU->isCall)
362+
break;
363+
I = Defs.erase(I);
364+
}
353365
}
354366
}
355367

356368
// Defs are pushed in the order they are visited and never reordered.
357-
Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
369+
for (MCRegUnit Unit : TRI->regunits(Reg))
370+
Defs.insert(PhysRegSUOper(SU, OperIdx, Unit));
358371
}
359372
}
360373

llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,11 +1363,11 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
13631363
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
13641364
; VI-NEXT: s_waitcnt vmcnt(0)
13651365
; VI-NEXT: v_mov_b32_e32 v1, s7
1366-
; VI-NEXT: s_movk_i32 s0, 0x7fff
13671366
; VI-NEXT: flat_load_ushort v3, v[0:1]
1368-
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1367+
; VI-NEXT: s_movk_i32 s0, 0x7fff
13691368
; VI-NEXT: v_mov_b32_e32 v0, s4
13701369
; VI-NEXT: v_mov_b32_e32 v1, s5
1370+
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
13711371
; VI-NEXT: s_waitcnt vmcnt(0)
13721372
; VI-NEXT: v_bfi_b32 v2, s0, v3, v2
13731373
; VI-NEXT: flat_store_short v[0:1], v2

llvm/test/CodeGen/AMDGPU/load-global-i16.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3788,13 +3788,13 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
37883788
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
37893789
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v3
37903790
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v2
3791+
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
3792+
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0
37913793
; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
37923794
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
37933795
; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
37943796
; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
37953797
; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
3796-
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
3797-
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0
37983798
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v1
37993799
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v0
38003800
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v29

llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,11 @@
44
# CHECK: SU(0): $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec
55
# CHECK: Successors:
66
# CHECK-NEXT: SU(2): Out Latency=1
7-
# CHECK-NEXT: SU(4): Out Latency=1
87
# CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0
9-
# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
108
# CHECK: SU(1): $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec
119
# CHECK: Successors:
1210
# CHECK-NEXT: SU(3): Out Latency=1
13-
# CHECK-NEXT: SU(4): Out Latency=1
1411
# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr1
15-
# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
1612
# CHECK: SU(2): $vgpr0 = V_ADD_CO_U32_e32 $sgpr2, $vgpr0, implicit-def $vcc, implicit $exec
1713
# CHECK: Predecessors:
1814
# CHECK-NEXT: SU(0): Out Latency=1
@@ -22,7 +18,6 @@
2218
# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
2319
# CHECK-NEXT: SU(3): Out Latency=1
2420
# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vcc
25-
# CHECK-NEXT: SU(4): Anti Latency=0
2621
# CHECK: SU(3): $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def dead $vcc, implicit $vcc, implicit $exec
2722
# CHECK: Predecessors:
2823
# CHECK-NEXT: SU(2): Out Latency=1
@@ -32,19 +27,12 @@
3227
# CHECK: Successors:
3328
# CHECK-NEXT: SU(4): Out Latency=1
3429
# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
35-
# CHECK-NEXT: SU(4): Anti Latency=0
3630
# CHECK: SU(4): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
3731
# CHECK: Predecessors:
3832
# CHECK-NEXT: SU(3): Out Latency=1
3933
# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
40-
# CHECK-NEXT: SU(3): Anti Latency=0
4134
# CHECK-NEXT: SU(2): Out Latency=1
4235
# CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0_vgpr1
43-
# CHECK-NEXT: SU(2): Anti Latency=0
44-
# CHECK-NEXT: SU(1): Out Latency=1
45-
# CHECK-NEXT: SU(1): Data Latency=1 Reg=$vgpr0_vgpr1
46-
# CHECK-NEXT: SU(0): Out Latency=1
47-
# CHECK-NEXT: SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
4836
# CHECK: Successors:
4937
# CHECK-NEXT: ExitSU: Ord Latency=3 Artificial
5038

0 commit comments

Comments
 (0)