Skip to content

Commit af0b69f

Browse files
authored
[AMDGPU] Support wide register or subregister access when emitting s_singleuse_vdst instructions. (#88520)
Single-use producer and consumer instructions that access wide registers or subregisters are now tracked correctly and are eligible to be marked as single use.
1 parent d3993ac commit af0b69f

File tree

2 files changed

+119
-8
lines changed

2 files changed

+119
-8
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,19 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
6464
bool InstructionEmitted = false;
6565

6666
for (MachineBasicBlock &MBB : MF) {
67-
DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits
67+
DenseMap<MCRegUnit, unsigned> RegisterUseCount;
6868

6969
// Handle boundaries at the end of basic block separately to avoid
7070
// false positives. If they are live at the end of a basic block then
7171
// assume it has more uses later on.
72-
for (const auto &Liveouts : MBB.liveouts())
73-
RegisterUseCount[Liveouts.PhysReg] = 2;
72+
for (const auto &Liveout : MBB.liveouts()) {
73+
for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
74+
++Units) {
75+
const auto [Unit, Mask] = *Units;
76+
if ((Mask & Liveout.LaneMask).any())
77+
RegisterUseCount[Unit] = 2;
78+
}
79+
}
7480

7581
for (MachineInstr &MI : reverse(MBB.instrs())) {
7682
// All registers in all operands need to be single use for an
@@ -84,7 +90,8 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
8490

8591
// Count the number of times each register is read.
8692
if (Operand.readsReg())
87-
RegisterUseCount[Reg]++;
93+
for (const MCRegUnit &Unit : TRI->regunits(Reg))
94+
RegisterUseCount[Unit]++;
8895

8996
// Do not attempt to optimise across exec mask changes.
9097
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
@@ -96,10 +103,16 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
96103
// check if the operands are single use.
97104
if (!MI.modifiesRegister(Reg, TRI))
98105
continue;
99-
if (RegisterUseCount[Reg] > 1)
106+
107+
const auto RegUnits = TRI->regunits(Reg);
108+
if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit &Unit) {
109+
return RegisterUseCount[Unit] > 1;
110+
}))
100111
AllProducerOperandsAreSingleUse = false;
112+
101113
// Reset uses count when a register is no longer live.
102-
RegisterUseCount.erase(Reg);
114+
for (const MCRegUnit &Unit : RegUnits)
115+
RegisterUseCount.erase(Unit);
103116
}
104117
if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
105118
// TODO: Replace with candidate logging for instruction grouping

llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir

Lines changed: 100 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -521,9 +521,7 @@ body: |
521521
; CHECK-NEXT: {{ $}}
522522
; CHECK-NEXT: S_SINGLEUSE_VDST 1
523523
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
524-
; CHECK-NEXT: S_SINGLEUSE_VDST 1
525524
; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec
526-
; CHECK-NEXT: S_SINGLEUSE_VDST 1
527525
; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec
528526
; CHECK-NEXT: {{ $}}
529527
; CHECK-NEXT: bb.1:
@@ -582,6 +580,31 @@ body: |
582580
liveins: $vgpr1
583581
...
584582

583+
# Write low 16-bits and then read 32-bit vgpr twice.
584+
---
585+
name: write_lo_read_full_twice
586+
tracksRegLiveness: true
587+
body: |
588+
; CHECK-LABEL: name: write_lo_read_full_twice
589+
; CHECK: bb.0:
590+
; CHECK-NEXT: successors: %bb.1(0x80000000)
591+
; CHECK-NEXT: liveins: $vgpr0
592+
; CHECK-NEXT: {{ $}}
593+
; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
594+
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
595+
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
596+
; CHECK-NEXT: {{ $}}
597+
; CHECK-NEXT: bb.1:
598+
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
599+
bb.0:
600+
liveins: $vgpr0
601+
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
602+
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
603+
$vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
604+
bb.1:
605+
liveins: $vgpr1, $vgpr2
606+
...
607+
585608
# Write high 16-bits and then read 32-bit vgpr.
586609
---
587610
name: write_hi_read_full
@@ -605,3 +628,78 @@ body: |
605628
bb.1:
606629
liveins: $vgpr1
607630
...
631+
632+
# Write high 16-bits and then read 32-bit vgpr twice.
633+
---
634+
name: write_hi_read_full_twice
635+
tracksRegLiveness: true
636+
body: |
637+
; CHECK-LABEL: name: write_hi_read_full_twice
638+
; CHECK: bb.0:
639+
; CHECK-NEXT: successors: %bb.1(0x80000000)
640+
; CHECK-NEXT: liveins: $vgpr0
641+
; CHECK-NEXT: {{ $}}
642+
; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
643+
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
644+
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
645+
; CHECK-NEXT: {{ $}}
646+
; CHECK-NEXT: bb.1:
647+
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
648+
bb.0:
649+
liveins: $vgpr0
650+
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
651+
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
652+
$vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
653+
bb.1:
654+
liveins: $vgpr1, $vgpr2
655+
...
656+
657+
# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr.
658+
---
659+
name: write_both_read_full
660+
tracksRegLiveness: true
661+
body: |
662+
; CHECK-LABEL: name: write_both_read_full
663+
; CHECK: bb.0:
664+
; CHECK-NEXT: successors: %bb.1(0x80000000)
665+
; CHECK-NEXT: {{ $}}
666+
; CHECK-NEXT: S_SINGLEUSE_VDST 1
667+
; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
668+
; CHECK-NEXT: S_SINGLEUSE_VDST 1
669+
; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
670+
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
671+
; CHECK-NEXT: {{ $}}
672+
; CHECK-NEXT: bb.1:
673+
; CHECK-NEXT: liveins: $vgpr1
674+
bb.0:
675+
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
676+
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
677+
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
678+
bb.1:
679+
liveins: $vgpr1
680+
...
681+
682+
# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr twice.
683+
---
684+
name: write_both_read_full_twice
685+
tracksRegLiveness: true
686+
body: |
687+
; CHECK-LABEL: name: write_both_read_full_twice
688+
; CHECK: bb.0:
689+
; CHECK-NEXT: successors: %bb.1(0x80000000)
690+
; CHECK-NEXT: {{ $}}
691+
; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
692+
; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
693+
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
694+
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
695+
; CHECK-NEXT: {{ $}}
696+
; CHECK-NEXT: bb.1:
697+
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
698+
bb.0:
699+
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
700+
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
701+
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
702+
$vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
703+
bb.1:
704+
liveins: $vgpr1, $vgpr2
705+
...

0 commit comments

Comments
 (0)