Skip to content

Commit e7e90dd

Browse files
authored
[AMDGPU] Adding multiple use analysis to SIPeepholeSDWA (#94800)
Allow for multiple uses of an operand where each instruction can be promoted to SDWA.

For instance:
  ; v_and_b32 v2, lit(0x0000ffff), v2
  ; v_and_b32 v3, 6, v2
  ; v_and_b32 v2, 1, v2

Can be folded to:
  ; v_and_b32 v3, 6, sel_lo(v2)
  ; v_and_b32 v2, 1, sel_lo(v2)
1 parent b1932b8 commit e7e90dd

18 files changed

+1250
-1192
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 53 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -37,20 +37,22 @@ STATISTIC(NumSDWAInstructionsPeepholed,
3737

3838
namespace {
3939

40+
bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
41+
const SIInstrInfo *TII);
4042
class SDWAOperand;
4143
class SDWADstOperand;
4244

43-
class SIPeepholeSDWA : public MachineFunctionPass {
44-
public:
45-
using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
45+
using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
46+
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;
4647

48+
class SIPeepholeSDWA : public MachineFunctionPass {
4749
private:
4850
MachineRegisterInfo *MRI;
4951
const SIRegisterInfo *TRI;
5052
const SIInstrInfo *TII;
5153

5254
MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
53-
MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
55+
SDWAOperandsMap PotentialMatches;
5456
SmallVector<MachineInstr *, 8> ConvertedInstructions;
5557

5658
std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
@@ -65,7 +67,6 @@ class SIPeepholeSDWA : public MachineFunctionPass {
6567
bool runOnMachineFunction(MachineFunction &MF) override;
6668
void matchSDWAOperands(MachineBasicBlock &MBB);
6769
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
68-
bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
6970
void pseudoOpConvertToVOP2(MachineInstr &MI,
7071
const GCNSubtarget &ST) const;
7172
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
@@ -93,7 +94,9 @@ class SDWAOperand {
9394

9495
virtual ~SDWAOperand() = default;
9596

96-
virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
97+
virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
98+
const GCNSubtarget &ST,
99+
SDWAOperandsMap *PotentialMatches = nullptr) = 0;
97100
virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
98101

99102
MachineOperand *getTargetOperand() const { return Target; }
@@ -126,7 +129,9 @@ class SDWASrcOperand : public SDWAOperand {
126129
: SDWAOperand(TargetOp, ReplacedOp),
127130
SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
128131

129-
MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
132+
MachineInstr *potentialToConvert(const SIInstrInfo *TII,
133+
const GCNSubtarget &ST,
134+
SDWAOperandsMap *PotentialMatches = nullptr) override;
130135
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
131136

132137
SdwaSel getSrcSel() const { return SrcSel; }
@@ -153,7 +158,9 @@ class SDWADstOperand : public SDWAOperand {
153158
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
154159
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
155160

156-
MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
161+
MachineInstr *potentialToConvert(const SIInstrInfo *TII,
162+
const GCNSubtarget &ST,
163+
SDWAOperandsMap *PotentialMatches = nullptr) override;
157164
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
158165

159166
SdwaSel getDstSel() const { return DstSel; }
@@ -327,7 +334,33 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
327334
return Mods;
328335
}
329336

330-
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
337+
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
338+
const GCNSubtarget &ST,
339+
SDWAOperandsMap *PotentialMatches) {
340+
if (PotentialMatches != nullptr) {
341+
// Fill out the map for all uses if all can be converted
342+
MachineOperand *Reg = getReplacedOperand();
343+
if (!Reg->isReg() || !Reg->isDef())
344+
return nullptr;
345+
346+
for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
347+
// Check that all instructions that use Reg can be converted
348+
if (!isConvertibleToSDWA(UseMI, ST, TII))
349+
return nullptr;
350+
351+
// Now that it's guaranteed all uses are legal, iterate over the uses again
352+
// to add them for later conversion.
353+
for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
354+
// Should not get a subregister here
355+
assert(isSameReg(UseMO, *Reg));
356+
357+
SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
358+
MachineInstr *UseMI = UseMO.getParent();
359+
potentialMatchesMap[UseMI].push_back(this);
360+
}
361+
return nullptr;
362+
}
363+
331364
// For SDWA src operand potential instruction is one that use register
332365
// defined by parent instruction
333366
MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
@@ -420,7 +453,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
420453
return true;
421454
}
422455

423-
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
456+
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
457+
const GCNSubtarget &ST,
458+
SDWAOperandsMap *PotentialMatches) {
424459
// For SDWA dst operand potential instruction is one that defines register
425460
// that this operand uses
426461
MachineRegisterInfo *MRI = getMRI();
@@ -919,8 +954,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
919954
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
920955
}
921956

922-
bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
923-
const GCNSubtarget &ST) const {
957+
namespace {
958+
bool isConvertibleToSDWA(MachineInstr &MI,
959+
const GCNSubtarget &ST,
960+
const SIInstrInfo* TII) {
924961
// Check if this is already an SDWA instruction
925962
unsigned Opc = MI.getOpcode();
926963
if (TII->isSDWA(Opc))
@@ -980,6 +1017,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
9801017

9811018
return true;
9821019
}
1020+
} // namespace
9831021

9841022
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
9851023
const SDWAOperandsVector &SDWAOperands) {
@@ -1215,7 +1253,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
12151253
matchSDWAOperands(MBB);
12161254
for (const auto &OperandPair : SDWAOperands) {
12171255
const auto &Operand = OperandPair.second;
1218-
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1256+
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
12191257
if (PotentialMI &&
12201258
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
12211259
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
@@ -1228,8 +1266,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
12281266

12291267
for (const auto &OperandPair : SDWAOperands) {
12301268
const auto &Operand = OperandPair.second;
1231-
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1232-
if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
1269+
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
1270+
if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
12331271
PotentialMatches[PotentialMI].push_back(Operand.get());
12341272
}
12351273
}

llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -771,36 +771,37 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
771771
; VI: ; %bb.0:
772772
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
773773
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
774-
; VI-NEXT: v_mov_b32_e32 v6, 8
774+
; VI-NEXT: v_mov_b32_e32 v6, 9
775+
; VI-NEXT: v_mov_b32_e32 v7, 8
775776
; VI-NEXT: s_waitcnt lgkmcnt(0)
776777
; VI-NEXT: v_mov_b32_e32 v0, s2
777778
; VI-NEXT: v_mov_b32_e32 v1, s3
778779
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
779780
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
780781
; VI-NEXT: flat_load_dword v1, v[0:1]
781782
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
782-
; VI-NEXT: v_mov_b32_e32 v2, 9
783+
; VI-NEXT: v_mov_b32_e32 v2, 0xff
783784
; VI-NEXT: s_waitcnt lgkmcnt(0)
784785
; VI-NEXT: v_mov_b32_e32 v5, s1
785786
; VI-NEXT: v_mov_b32_e32 v4, s0
786787
; VI-NEXT: s_waitcnt vmcnt(0)
787-
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1
788-
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
788+
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
789+
; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
789790
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
790791
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
791792
; VI-NEXT: v_add_u16_e32 v9, 9, v1
792-
; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
793-
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
794-
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
795-
; VI-NEXT: v_add_u16_e32 v7, 9, v7
793+
; VI-NEXT: v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
794+
; VI-NEXT: v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
795+
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
796+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
796797
; VI-NEXT: v_add_u16_e32 v8, 9, v8
797-
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
798798
; VI-NEXT: v_and_b32_e32 v10, 0xff, v10
799-
; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
800-
; VI-NEXT: v_and_b32_e32 v1, 0xff, v8
799+
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
800+
; VI-NEXT: v_and_b32_e32 v6, 0xff, v6
801+
; VI-NEXT: v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
802+
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v10
801803
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
802-
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
803-
; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10
804+
; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
804805
; VI-NEXT: v_or_b32_e32 v0, v0, v1
805806
; VI-NEXT: v_or_b32_e32 v2, v0, v2
806807
; VI-NEXT: v_mov_b32_e32 v0, s2

0 commit comments

Comments
 (0)