Skip to content

Commit 37d7b06

Browse files
authored
[AMDGPU][SILoadStoreOptimizer] Include constrained buffer load variants (#101619)
Use the constrained buffer load opcodes when combining under-aligned loads on XNACK-enabled subtargets.
1 parent 19f3794 commit 37d7b06

File tree

3 files changed

+619
-82
lines changed

3 files changed

+619
-82
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 69 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
352352
return 1;
353353
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354354
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
356+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
355357
case AMDGPU::S_LOAD_DWORDX2_IMM:
356358
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
357359
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
363365
return 2;
364366
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
365367
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
368+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
369+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
366370
case AMDGPU::S_LOAD_DWORDX3_IMM:
367371
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
368372
case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
374378
return 3;
375379
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376380
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
381+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
382+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
377383
case AMDGPU::S_LOAD_DWORDX4_IMM:
378384
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
379385
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
385391
return 4;
386392
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
387393
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
394+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
395+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
388396
case AMDGPU::S_LOAD_DWORDX8_IMM:
389397
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
390398
return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
499507
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
500508
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
501509
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
510+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
511+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
512+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
513+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
502514
return S_BUFFER_LOAD_IMM;
503515
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
504516
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
505517
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
506518
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
507519
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
520+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
521+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
522+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
523+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
508524
return S_BUFFER_LOAD_SGPR_IMM;
509525
case AMDGPU::S_LOAD_DWORD_IMM:
510526
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
587603
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
588604
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
589605
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
606+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
607+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
608+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
609+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
590610
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
591611
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
592612
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
593613
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
594614
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
595615
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
616+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
617+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
618+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
619+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
596620
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
597621
case AMDGPU::S_LOAD_DWORD_IMM:
598622
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,13 +727,21 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
703727
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
704728
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
705729
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
730+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
731+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
732+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
733+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
706734
Result.SOffset = true;
707735
[[fallthrough]];
708736
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
709737
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
710738
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
711739
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
712740
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
741+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
742+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
743+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
744+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
713745
case AMDGPU::S_LOAD_DWORD_IMM:
714746
case AMDGPU::S_LOAD_DWORDX2_IMM:
715747
case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
16791711
return New;
16801712
}
16811713

1714+
static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1715+
ArrayRef<MachineMemOperand *> MMOs,
1716+
unsigned Width) {
1717+
// Conservatively returns true when there is not exactly one MMO to check.
1718+
return STM.isXNACKEnabled() &&
1719+
(MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1720+
}
1721+
16821722
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
16831723
const CombineInfo &Paired) {
16841724
const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1736,55 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
16961736

16971737
case UNKNOWN:
16981738
llvm_unreachable("Unknown instruction class");
1699-
case S_BUFFER_LOAD_IMM:
1739+
case S_BUFFER_LOAD_IMM: {
1740+
// If XNACK is enabled, use the constrained opcodes when the first load is
1741+
// under-aligned.
1742+
bool NeedsConstrainedOpc =
1743+
needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
17001744
switch (Width) {
17011745
default:
17021746
return 0;
17031747
case 2:
1704-
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1748+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1749+
: AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
17051750
case 3:
1706-
return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1751+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1752+
: AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
17071753
case 4:
1708-
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1754+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1755+
: AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
17091756
case 8:
1710-
return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1757+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1758+
: AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
17111759
}
1712-
case S_BUFFER_LOAD_SGPR_IMM:
1760+
}
1761+
case S_BUFFER_LOAD_SGPR_IMM: {
1762+
// If XNACK is enabled, use the constrained opcodes when the first load is
1763+
// under-aligned.
1764+
bool NeedsConstrainedOpc =
1765+
needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
17131766
switch (Width) {
17141767
default:
17151768
return 0;
17161769
case 2:
1717-
return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1770+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1771+
: AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
17181772
case 3:
1719-
return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1773+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1774+
: AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
17201775
case 4:
1721-
return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1776+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1777+
: AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
17221778
case 8:
1723-
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1779+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1780+
: AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
17241781
}
1782+
}
17251783
case S_LOAD_IMM: {
17261784
// If XNACK is enabled, use the constrained opcodes when the first load is
17271785
// under-aligned.
1728-
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
17291786
bool NeedsConstrainedOpc =
1730-
STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1787+
needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
17311788
switch (Width) {
17321789
default:
17331790
return 0;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -523,14 +523,23 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
523523
; GFX67-NEXT: exp mrt0 v0, v1, v0, v0 done vm
524524
; GFX67-NEXT: s_endpgm
525525
;
526-
; GFX8910-LABEL: s_buffer_load_imm_mergex2:
527-
; GFX8910: ; %bb.0: ; %main_body
528-
; GFX8910-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
529-
; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
530-
; GFX8910-NEXT: v_mov_b32_e32 v0, s0
531-
; GFX8910-NEXT: v_mov_b32_e32 v1, s1
532-
; GFX8910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
533-
; GFX8910-NEXT: s_endpgm
526+
; GFX8-LABEL: s_buffer_load_imm_mergex2:
527+
; GFX8: ; %bb.0: ; %main_body
528+
; GFX8-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
529+
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
530+
; GFX8-NEXT: v_mov_b32_e32 v0, s0
531+
; GFX8-NEXT: v_mov_b32_e32 v1, s1
532+
; GFX8-NEXT: exp mrt0 v0, v1, v0, v0 done vm
533+
; GFX8-NEXT: s_endpgm
534+
;
535+
; GFX910-LABEL: s_buffer_load_imm_mergex2:
536+
; GFX910: ; %bb.0: ; %main_body
537+
; GFX910-NEXT: s_buffer_load_dwordx2 s[4:5], s[0:3], 0x4
538+
; GFX910-NEXT: s_waitcnt lgkmcnt(0)
539+
; GFX910-NEXT: v_mov_b32_e32 v0, s4
540+
; GFX910-NEXT: v_mov_b32_e32 v1, s5
541+
; GFX910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
542+
; GFX910-NEXT: s_endpgm
534543
;
535544
; GFX11-LABEL: s_buffer_load_imm_mergex2:
536545
; GFX11: ; %bb.0: ; %main_body
@@ -570,16 +579,27 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
570579
; GFX67-NEXT: exp mrt0 v0, v1, v2, v3 done vm
571580
; GFX67-NEXT: s_endpgm
572581
;
573-
; GFX8910-LABEL: s_buffer_load_imm_mergex4:
574-
; GFX8910: ; %bb.0: ; %main_body
575-
; GFX8910-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
576-
; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
577-
; GFX8910-NEXT: v_mov_b32_e32 v0, s0
578-
; GFX8910-NEXT: v_mov_b32_e32 v1, s1
579-
; GFX8910-NEXT: v_mov_b32_e32 v2, s2
580-
; GFX8910-NEXT: v_mov_b32_e32 v3, s3
581-
; GFX8910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
582-
; GFX8910-NEXT: s_endpgm
582+
; GFX8-LABEL: s_buffer_load_imm_mergex4:
583+
; GFX8: ; %bb.0: ; %main_body
584+
; GFX8-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
585+
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
586+
; GFX8-NEXT: v_mov_b32_e32 v0, s0
587+
; GFX8-NEXT: v_mov_b32_e32 v1, s1
588+
; GFX8-NEXT: v_mov_b32_e32 v2, s2
589+
; GFX8-NEXT: v_mov_b32_e32 v3, s3
590+
; GFX8-NEXT: exp mrt0 v0, v1, v2, v3 done vm
591+
; GFX8-NEXT: s_endpgm
592+
;
593+
; GFX910-LABEL: s_buffer_load_imm_mergex4:
594+
; GFX910: ; %bb.0: ; %main_body
595+
; GFX910-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x8
596+
; GFX910-NEXT: s_waitcnt lgkmcnt(0)
597+
; GFX910-NEXT: v_mov_b32_e32 v0, s4
598+
; GFX910-NEXT: v_mov_b32_e32 v1, s5
599+
; GFX910-NEXT: v_mov_b32_e32 v2, s6
600+
; GFX910-NEXT: v_mov_b32_e32 v3, s7
601+
; GFX910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
602+
; GFX910-NEXT: s_endpgm
583603
;
584604
; GFX11-LABEL: s_buffer_load_imm_mergex4:
585605
; GFX11: ; %bb.0: ; %main_body

0 commit comments

Comments
 (0)