[AMDGPU][SILoadStoreOptimizer] Include constrained buffer load variants #101619

Merged
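The SILoadStoreOptimizer pass merges adjacent buffer loads into a single wider load. When XNACK is enabled and the merged access is not known to be naturally aligned, the backend must select the constrained ("_ec") opcode variants; the pass already did this for the S_LOAD_IMM class, and this patch extends the same handling to S_BUFFER_LOAD_IMM and S_BUFFER_LOAD_SGPR_IMM by teaching the width/class/subclass tables about the _ec opcodes and routing the decision through a shared needsConstrainedOpcode helper. As a rough illustration of the rule the patch applies (a standalone sketch, not the in-tree code; selectMergedOpcode and MergedOpcode are hypothetical names):

// Standalone sketch of the selection rule; the in-tree logic is
// needsConstrainedOpcode() + getNewOpcode() in SILoadStoreOptimizer.cpp.
#include <cstdint>

enum class MergedOpcode { Normal, Constrained /* the "_ec" variant */ };

// XNACKEnabled  - the subtarget reports isXNACKEnabled().
// HaveSingleMMO - the first load carries exactly one MachineMemOperand.
// FirstAlignB   - alignment in bytes recorded on that memory operand.
// WidthInDwords - combined width of the merged load (2, 3, 4, or 8 dwords).
static MergedOpcode selectMergedOpcode(bool XNACKEnabled, bool HaveSingleMMO,
                                       uint64_t FirstAlignB,
                                       unsigned WidthInDwords) {
  // Natural alignment for the merged load is WidthInDwords * 4 bytes; if the
  // MMO is missing, conservatively treat the access as under-aligned.
  bool UnderAligned = !HaveSingleMMO || FirstAlignB < WidthInDwords * 4ull;
  return (XNACKEnabled && UnderAligned) ? MergedOpcode::Constrained
                                        : MergedOpcode::Normal;
}

For example, merging two dword loads (WidthInDwords = 2) needs 8-byte alignment; with XNACK enabled and only 4-byte alignment known on the first load, getNewOpcode now returns S_BUFFER_LOAD_DWORDX2_IMM_ec instead of S_BUFFER_LOAD_DWORDX2_IMM.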
81 changes: 69 additions & 12 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 1;
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 2;
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 3;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 4;
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
     return S_BUFFER_LOAD_IMM;
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
     return S_BUFFER_LOAD_SGPR_IMM;
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,13 +727,21 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
     Result.SOffset = true;
     [[fallthrough]];
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
   return New;
 }
 
+static bool needsConstrainedOpcode(const GCNSubtarget &STM,
+                                   ArrayRef<MachineMemOperand *> MMOs,
+                                   unsigned Width) {
+  // Conservatively return true if the MMO is not found.
+  return STM.isXNACKEnabled() &&
+         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
+}
+
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
   const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1736,55 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
 
   case UNKNOWN:
     llvm_unreachable("Unknown instruction class");
-  case S_BUFFER_LOAD_IMM:
+  case S_BUFFER_LOAD_IMM: {
+    // If XNACK is enabled, use the constrained opcodes when the first load is
+    // under-aligned.
+    bool NeedsConstrainedOpc =
+        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
     switch (Width) {
     default:
       return 0;
     case 2:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     case 3:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
     case 4:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
     }
-  case S_BUFFER_LOAD_SGPR_IMM:
+  }
+  case S_BUFFER_LOAD_SGPR_IMM: {
+    // If XNACK is enabled, use the constrained opcodes when the first load is
+    // under-aligned.
+    bool NeedsConstrainedOpc =
+        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
     switch (Width) {
     default:
       return 0;
     case 2:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
     case 3:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
     case 4:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
     case 8:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
+  }
   case S_LOAD_IMM: {
     // If XNACK is enabled, use the constrained opcodes when the first load is
     // under-aligned.
-    const MachineMemOperand *MMO = *CI.I->memoperands_begin();
     bool NeedsConstrainedOpc =
-        STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
     switch (Width) {
     default:
       return 0;
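The test updates below split the old GFX8910 check prefix into GFX8 and GFX910, because the GFX9/GFX10 output now differs: the merged s_buffer_load_dwordx2 / s_buffer_load_dwordx4 results move from s[0:1] / s[0:3] to s[4:5] / s[4:7]. That is consistent with the constrained opcodes restricting the destination so it cannot overlap the s[0:3] buffer descriptor being read. A small self-contained illustration of that overlap check follows (overlapsDescriptor is a hypothetical helper, not an LLVM API; in-tree the restriction comes from the _ec opcode definitions and the register allocator):

#include <cassert>

// True if the SGPR tuple s[ResultLo : ResultLo + WidthInDwords - 1] overlaps
// the buffer descriptor tuple s[DescLo : DescLo + 3].
static bool overlapsDescriptor(unsigned ResultLo, unsigned WidthInDwords,
                               unsigned DescLo = 0) {
  unsigned ResultEnd = ResultLo + WidthInDwords; // exclusive
  unsigned DescEnd = DescLo + 4;                 // exclusive
  return ResultLo < DescEnd && DescLo < ResultEnd;
}

int main() {
  // Old check: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4  (result overlaps the descriptor)
  assert(overlapsDescriptor(/*ResultLo=*/0, /*WidthInDwords=*/2));
  // New check: s_buffer_load_dwordx2 s[4:5], s[0:3], 0x4  (no overlap)
  assert(!overlapsDescriptor(/*ResultLo=*/4, /*WidthInDwords=*/2));
  return 0;
}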
56 changes: 38 additions & 18 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -523,14 +523,23 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
 ; GFX67-NEXT: exp mrt0 v0, v1, v0, v0 done vm
 ; GFX67-NEXT: s_endpgm
 ;
-; GFX8910-LABEL: s_buffer_load_imm_mergex2:
-; GFX8910: ; %bb.0: ; %main_body
-; GFX8910-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
-; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8910-NEXT: v_mov_b32_e32 v0, s0
-; GFX8910-NEXT: v_mov_b32_e32 v1, s1
-; GFX8910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
-; GFX8910-NEXT: s_endpgm
+; GFX8-LABEL: s_buffer_load_imm_mergex2:
+; GFX8: ; %bb.0: ; %main_body
+; GFX8-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: exp mrt0 v0, v1, v0, v0 done vm
+; GFX8-NEXT: s_endpgm
+;
+; GFX910-LABEL: s_buffer_load_imm_mergex2:
+; GFX910: ; %bb.0: ; %main_body
+; GFX910-NEXT: s_buffer_load_dwordx2 s[4:5], s[0:3], 0x4
+; GFX910-NEXT: s_waitcnt lgkmcnt(0)
+; GFX910-NEXT: v_mov_b32_e32 v0, s4
+; GFX910-NEXT: v_mov_b32_e32 v1, s5
+; GFX910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
+; GFX910-NEXT: s_endpgm
 ;
 ; GFX11-LABEL: s_buffer_load_imm_mergex2:
 ; GFX11: ; %bb.0: ; %main_body
@@ -570,16 +579,27 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
 ; GFX67-NEXT: exp mrt0 v0, v1, v2, v3 done vm
 ; GFX67-NEXT: s_endpgm
 ;
-; GFX8910-LABEL: s_buffer_load_imm_mergex4:
-; GFX8910: ; %bb.0: ; %main_body
-; GFX8910-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
-; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8910-NEXT: v_mov_b32_e32 v0, s0
-; GFX8910-NEXT: v_mov_b32_e32 v1, s1
-; GFX8910-NEXT: v_mov_b32_e32 v2, s2
-; GFX8910-NEXT: v_mov_b32_e32 v3, s3
-; GFX8910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
-; GFX8910-NEXT: s_endpgm
+; GFX8-LABEL: s_buffer_load_imm_mergex4:
+; GFX8: ; %bb.0: ; %main_body
+; GFX8-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; GFX8-NEXT: s_endpgm
+;
+; GFX910-LABEL: s_buffer_load_imm_mergex4:
+; GFX910: ; %bb.0: ; %main_body
+; GFX910-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x8
+; GFX910-NEXT: s_waitcnt lgkmcnt(0)
+; GFX910-NEXT: v_mov_b32_e32 v0, s4
+; GFX910-NEXT: v_mov_b32_e32 v1, s5
+; GFX910-NEXT: v_mov_b32_e32 v2, s6
+; GFX910-NEXT: v_mov_b32_e32 v3, s7
+; GFX910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; GFX910-NEXT: s_endpgm
 ;
 ; GFX11-LABEL: s_buffer_load_imm_mergex4:
 ; GFX11: ; %bb.0: ; %main_body