Skip to content

Commit ad8a8df

Browse files
committed
[AMDGPU][SILoadStoreOptimizer] Include constrained buffer load variants
Use the constrained buffer load opcodes when combining under-aligned loads on XNACK-enabled subtargets.
1 parent 38394b9 commit ad8a8df

File tree

3 files changed

+613
-82
lines changed

3 files changed

+613
-82
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 63 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
352352
return 1;
353353
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354354
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
356+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
355357
case AMDGPU::S_LOAD_DWORDX2_IMM:
356358
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
357359
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
363365
return 2;
364366
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
365367
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
368+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
369+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
366370
case AMDGPU::S_LOAD_DWORDX3_IMM:
367371
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
368372
case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
374378
return 3;
375379
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376380
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
381+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
382+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
377383
case AMDGPU::S_LOAD_DWORDX4_IMM:
378384
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
379385
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
385391
return 4;
386392
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
387393
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
394+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
395+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
388396
case AMDGPU::S_LOAD_DWORDX8_IMM:
389397
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
390398
return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
499507
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
500508
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
501509
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
510+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
511+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
512+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
513+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
502514
return S_BUFFER_LOAD_IMM;
503515
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
504516
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
505517
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
506518
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
507519
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
520+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
521+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
522+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
523+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
508524
return S_BUFFER_LOAD_SGPR_IMM;
509525
case AMDGPU::S_LOAD_DWORD_IMM:
510526
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
587603
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
588604
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
589605
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
606+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
607+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
608+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
609+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
590610
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
591611
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
592612
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
593613
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
594614
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
595615
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
616+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
617+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
618+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
619+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
596620
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
597621
case AMDGPU::S_LOAD_DWORD_IMM:
598622
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,13 +727,21 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
703727
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
704728
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
705729
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
730+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
731+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
732+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
733+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
706734
Result.SOffset = true;
707735
[[fallthrough]];
708736
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
709737
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
710738
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
711739
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
712740
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
741+
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
742+
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
743+
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
744+
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
713745
case AMDGPU::S_LOAD_DWORD_IMM:
714746
case AMDGPU::S_LOAD_DWORDX2_IMM:
715747
case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
16791711
return New;
16801712
}
16811713

1714+
/// Decide whether a merged scalar load has to use a constrained ("_ec")
/// opcode variant instead of the regular one.
///
/// \param STM   Subtarget that is queried for whether XNACK is enabled.
/// \param MMO   Memory operand whose alignment is inspected; the callers pass
///              the first memory operand of the first load being combined.
/// \param Width Width of the merged access; the callers pass the sum of the
///              widths of the two combined loads (counted in dwords).
/// \returns true only when XNACK is enabled and the alignment of \p MMO is
///          smaller than Width * 4, i.e. (presumably) smaller than the merged
///          access size in bytes, so the merged load would be under-aligned.
static bool needsConstraintedOpcode(const GCNSubtarget &STM,
1715+
const MachineMemOperand *MMO,
1716+
unsigned Width) {
1717+
return STM.isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1718+
}
1719+
16821720
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
16831721
const CombineInfo &Paired) {
16841722
const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1734,51 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
16961734

16971735
case UNKNOWN:
16981736
llvm_unreachable("Unknown instruction class");
1699-
case S_BUFFER_LOAD_IMM:
1737+
case S_BUFFER_LOAD_IMM: {
1738+
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1739+
bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
17001740
switch (Width) {
17011741
default:
17021742
return 0;
17031743
case 2:
1704-
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1744+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1745+
: AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
17051746
case 3:
1706-
return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1747+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1748+
: AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
17071749
case 4:
1708-
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1750+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1751+
: AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
17091752
case 8:
1710-
return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1753+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1754+
: AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
17111755
}
1712-
case S_BUFFER_LOAD_SGPR_IMM:
1756+
}
1757+
case S_BUFFER_LOAD_SGPR_IMM: {
1758+
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1759+
bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
17131760
switch (Width) {
17141761
default:
17151762
return 0;
17161763
case 2:
1717-
return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1764+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1765+
: AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
17181766
case 3:
1719-
return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1767+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1768+
: AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
17201769
case 4:
1721-
return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1770+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1771+
: AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
17221772
case 8:
1723-
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1773+
return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1774+
: AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
17241775
}
1776+
}
17251777
case S_LOAD_IMM: {
17261778
// If XNACK is enabled, use the constrained opcodes when the first load is
17271779
// under-aligned.
17281780
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1729-
bool NeedsConstrainedOpc =
1730-
STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1781+
bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
17311782
switch (Width) {
17321783
default:
17331784
return 0;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -523,14 +523,23 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
523523
; GFX67-NEXT: exp mrt0 v0, v1, v0, v0 done vm
524524
; GFX67-NEXT: s_endpgm
525525
;
526-
; GFX8910-LABEL: s_buffer_load_imm_mergex2:
527-
; GFX8910: ; %bb.0: ; %main_body
528-
; GFX8910-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
529-
; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
530-
; GFX8910-NEXT: v_mov_b32_e32 v0, s0
531-
; GFX8910-NEXT: v_mov_b32_e32 v1, s1
532-
; GFX8910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
533-
; GFX8910-NEXT: s_endpgm
526+
; GFX8-LABEL: s_buffer_load_imm_mergex2:
527+
; GFX8: ; %bb.0: ; %main_body
528+
; GFX8-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
529+
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
530+
; GFX8-NEXT: v_mov_b32_e32 v0, s0
531+
; GFX8-NEXT: v_mov_b32_e32 v1, s1
532+
; GFX8-NEXT: exp mrt0 v0, v1, v0, v0 done vm
533+
; GFX8-NEXT: s_endpgm
534+
;
535+
; GFX910-LABEL: s_buffer_load_imm_mergex2:
536+
; GFX910: ; %bb.0: ; %main_body
537+
; GFX910-NEXT: s_buffer_load_dwordx2 s[4:5], s[0:3], 0x4
538+
; GFX910-NEXT: s_waitcnt lgkmcnt(0)
539+
; GFX910-NEXT: v_mov_b32_e32 v0, s4
540+
; GFX910-NEXT: v_mov_b32_e32 v1, s5
541+
; GFX910-NEXT: exp mrt0 v0, v1, v0, v0 done vm
542+
; GFX910-NEXT: s_endpgm
534543
;
535544
; GFX11-LABEL: s_buffer_load_imm_mergex2:
536545
; GFX11: ; %bb.0: ; %main_body
@@ -570,16 +579,27 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
570579
; GFX67-NEXT: exp mrt0 v0, v1, v2, v3 done vm
571580
; GFX67-NEXT: s_endpgm
572581
;
573-
; GFX8910-LABEL: s_buffer_load_imm_mergex4:
574-
; GFX8910: ; %bb.0: ; %main_body
575-
; GFX8910-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
576-
; GFX8910-NEXT: s_waitcnt lgkmcnt(0)
577-
; GFX8910-NEXT: v_mov_b32_e32 v0, s0
578-
; GFX8910-NEXT: v_mov_b32_e32 v1, s1
579-
; GFX8910-NEXT: v_mov_b32_e32 v2, s2
580-
; GFX8910-NEXT: v_mov_b32_e32 v3, s3
581-
; GFX8910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
582-
; GFX8910-NEXT: s_endpgm
582+
; GFX8-LABEL: s_buffer_load_imm_mergex4:
583+
; GFX8: ; %bb.0: ; %main_body
584+
; GFX8-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
585+
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
586+
; GFX8-NEXT: v_mov_b32_e32 v0, s0
587+
; GFX8-NEXT: v_mov_b32_e32 v1, s1
588+
; GFX8-NEXT: v_mov_b32_e32 v2, s2
589+
; GFX8-NEXT: v_mov_b32_e32 v3, s3
590+
; GFX8-NEXT: exp mrt0 v0, v1, v2, v3 done vm
591+
; GFX8-NEXT: s_endpgm
592+
;
593+
; GFX910-LABEL: s_buffer_load_imm_mergex4:
594+
; GFX910: ; %bb.0: ; %main_body
595+
; GFX910-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x8
596+
; GFX910-NEXT: s_waitcnt lgkmcnt(0)
597+
; GFX910-NEXT: v_mov_b32_e32 v0, s4
598+
; GFX910-NEXT: v_mov_b32_e32 v1, s5
599+
; GFX910-NEXT: v_mov_b32_e32 v2, s6
600+
; GFX910-NEXT: v_mov_b32_e32 v3, s7
601+
; GFX910-NEXT: exp mrt0 v0, v1, v2, v3 done vm
602+
; GFX910-NEXT: s_endpgm
583603
;
584604
; GFX11-LABEL: s_buffer_load_imm_mergex4:
585605
; GFX11: ; %bb.0: ; %main_body

0 commit comments

Comments
 (0)