Skip to content

[AMDGPU] Merge consecutive wait_alu instruction #128916

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 46 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,46 @@ class AMDGPUWaitSGPRHazards {
BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
}

unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
unsigned Mask = 0xffff;
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
return Mask;
}

/// Try to fold a wait with immediate \p Mask into the instruction that
/// precedes \p MI in its basic block (found via prev_nodbg, so debug
/// instructions are stepped over).
///
/// \returns true if that preceding instruction is an S_WAITCNT_DEPCTR; its
/// immediate operand is then replaced with mergeMasks() of \p Mask and its
/// current immediate. \returns false, leaving everything untouched, if \p MI
/// is the first instruction of the block or the preceding instruction has any
/// other opcode.
bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
                              unsigned Mask) {
  auto *Parent = MI->getParent();
  const auto BlockBegin = Parent->instr_begin();

  // Nothing precedes MI, so there is nothing to merge into.
  if (MI == BlockBegin)
    return false;

  auto Prev = prev_nodbg(MI, BlockBegin);
  const bool PrevIsWaitAlu = Prev->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR;
  if (!PrevIsWaitAlu)
    return false;

  // Rewrite the existing wait in place instead of emitting a second one.
  auto &ImmOp = Prev->getOperand(0);
  ImmOp.setImm(mergeMasks(Mask, ImmOp.getImm()));
  return true;
}

bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };

Expand Down Expand Up @@ -362,10 +402,12 @@ class AMDGPUWaitSGPRHazards {
Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
}
if (Emit) {
auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(Mask);
updateGetPCBundle(NewMI);
if (!mergeConsecutiveWaitAlus(MI, Mask)) {
auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(Mask);
updateGetPCBundle(NewMI);
}
Emitted = true;
}
}
Expand Down
36 changes: 36 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,18 @@ inline unsigned getSaSdstBitWidth() { return 1; }
/// \returns SaSdst bit shift (0, i.e. the field starts at the lowest bit).
inline unsigned getSaSdstBitShift() { return 0; }

/// \returns VaSsrc width (in bits; handed to packBits/unpackBits by the
/// VaSsrc encode/decode helpers in this file).
inline unsigned getVaSsrcBitWidth() { return 1; }

/// \returns VaSsrc bit shift (handed to packBits/unpackBits by the VaSsrc
/// encode/decode helpers in this file).
inline unsigned getVaSsrcBitShift() { return 8; }

/// \returns HoldCnt width (in bits; handed to packBits/unpackBits by the
/// HoldCnt encode/decode helpers in this file).
inline unsigned getHoldCntWidth() { return 1; }

/// \returns HoldCnt bit shift (handed to packBits/unpackBits by the HoldCnt
/// encode/decode helpers in this file).
inline unsigned getHoldCntBitShift() { return 7; }

} // end anonymous namespace

namespace llvm {
Expand Down Expand Up @@ -1740,6 +1752,14 @@ unsigned decodeFieldVaVcc(unsigned Encoded) {
return unpackBits(Encoded, getVaVccBitShift(), getVaVccBitWidth());
}

// Extract the VaSsrc field from the immediate \p Encoded.
unsigned decodeFieldVaSsrc(unsigned Encoded) {
  const unsigned Shift = getVaSsrcBitShift();
  const unsigned Width = getVaSsrcBitWidth();
  return unpackBits(Encoded, Shift, Width);
}

// Extract the HoldCnt field from the immediate \p Encoded.
unsigned decodeFieldHoldCnt(unsigned Encoded) {
  const unsigned Shift = getHoldCntBitShift();
  const unsigned Width = getHoldCntWidth();
  return unpackBits(Encoded, Shift, Width);
}

// Combine the immediate \p Encoded with \p VmVsrc packed at the VmVsrc field's
// shift and width.
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) {
  const unsigned Shift = getVmVsrcBitShift();
  const unsigned Width = getVmVsrcBitWidth();
  return packBits(VmVsrc, Encoded, Shift, Width);
}
Expand Down Expand Up @@ -1780,6 +1800,22 @@ unsigned encodeFieldVaVcc(unsigned VaVcc) {
return encodeFieldVaVcc(0xffff, VaVcc);
}

// Combine the immediate \p Encoded with \p VaSsrc packed at the VaSsrc field's
// shift and width.
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) {
  const unsigned Shift = getVaSsrcBitShift();
  const unsigned Width = getVaSsrcBitWidth();
  return packBits(VaSsrc, Encoded, Shift, Width);
}

// Encode \p VaSsrc on top of the 0xffff base immediate.
unsigned encodeFieldVaSsrc(unsigned VaSsrc) {
  const unsigned BaseImm = 0xffff;
  return encodeFieldVaSsrc(BaseImm, VaSsrc);
}

// Combine the immediate \p Encoded with \p HoldCnt packed at the HoldCnt
// field's shift and width.
unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) {
  const unsigned Shift = getHoldCntBitShift();
  const unsigned Width = getHoldCntWidth();
  return packBits(HoldCnt, Encoded, Shift, Width);
}

// Encode \p HoldCnt on top of the 0xffff base immediate.
unsigned encodeFieldHoldCnt(unsigned HoldCnt) {
  const unsigned BaseImm = 0xffff;
  return encodeFieldHoldCnt(BaseImm, HoldCnt);
}

} // namespace DepCtr

//===----------------------------------------------------------------------===//
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,12 @@ unsigned decodeFieldVaSdst(unsigned Encoded);
/// \returns Decoded VaVcc from given immediate \p Encoded.
unsigned decodeFieldVaVcc(unsigned Encoded);

/// \returns Decoded VaSsrc from given immediate \p Encoded.
unsigned decodeFieldVaSsrc(unsigned Encoded);

/// \returns Decoded HoldCnt from given immediate \p Encoded.
unsigned decodeFieldHoldCnt(unsigned Encoded);

/// \returns \p VmVsrc as an encoded Depctr immediate.
unsigned encodeFieldVmVsrc(unsigned VmVsrc);

Expand Down Expand Up @@ -1210,6 +1216,18 @@ unsigned encodeFieldVaVcc(unsigned VaVcc);
/// \returns \p Encoded combined with encoded \p VaVcc.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc);

/// \returns \p HoldCnt as an encoded Depctr immediate.
unsigned encodeFieldHoldCnt(unsigned HoldCnt);

/// \returns \p Encoded combined with encoded \p HoldCnt.
/// The existing immediate comes first and the field value second, matching
/// the definition and the other two-argument encodeField* helpers.
unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt);

/// \returns \p VaSsrc as an encoded Depctr immediate.
unsigned encodeFieldVaSsrc(unsigned VaSsrc);

/// \returns \p Encoded combined with encoded \p VaSsrc.
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc);

} // namespace DepCtr

namespace Exp {
Expand Down
79 changes: 79 additions & 0 deletions llvm/test/CodeGen/AMDGPU/merge-consecutive-wait-alus.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass amdgpu-wait-sgpr-hazards -o - %s | FileCheck %s


---
# An S_WAITCNT_DEPCTR (65530) already sits between the V_CMP that writes $sgpr0
# and the V_CNDMASK that reads it. The CHECK lines expect a single
# S_WAITCNT_DEPCTR 61946 in the output rather than a second wait placed next
# to the existing one.
name: merge_consecutive_wait_alus
body: |
bb.0:
liveins: $vgpr0

; CHECK-LABEL: name: merge_consecutive_wait_alus
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc_lo
; CHECK-NEXT: S_WAITCNT_DEPCTR 61946
; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc_lo
renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc
S_WAITCNT_DEPCTR 65530
renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc
...
---
name: merge_consecutive_wait_alus_two_bb
body: |
; CHECK-LABEL: name: merge_consecutive_wait_alus_two_bb
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc_lo
; CHECK-NEXT: S_WAITCNT_DEPCTR 65530
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAITCNT_DEPCTR 61951
; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc_lo
bb.0:
liveins: $vgpr0

renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc
S_WAITCNT_DEPCTR 65530

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test where it's the start of the block and end of the block

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also add a test that skips meta instructions

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test where it's the start of the block and end of the block

Do you mean within the same basic block, or across a predecessor and its successor?

bb.1:
liveins: $sgpr0

renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc
...
---
# A SCHED_BARRIER separates the existing S_WAITCNT_DEPCTR from the V_CNDMASK.
# The CHECK lines expect the existing wait to keep its 65530 immediate and a
# separate S_WAITCNT_DEPCTR 61951 to appear after the barrier, i.e. no merge
# across the SCHED_BARRIER.
name: meta_instructions
machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: meta_instructions
; CHECK: renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc_lo
; CHECK-NEXT: S_WAITCNT_DEPCTR 65530
; CHECK-NEXT: SCHED_BARRIER 0
; CHECK-NEXT: S_WAITCNT_DEPCTR 61951
; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc_lo
renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc
S_WAITCNT_DEPCTR 65530
SCHED_BARRIER 0
renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc
...
---
# A DBG_VALUE separates the existing S_WAITCNT_DEPCTR from the V_CNDMASK.
# The CHECK lines expect a single S_WAITCNT_DEPCTR 61946 ahead of the
# DBG_VALUE, i.e. the debug instruction does not prevent the merge.
name: debug_instruction
machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: debug_instruction
; CHECK: renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc_lo
; CHECK-NEXT: S_WAITCNT_DEPCTR 61946
; CHECK-NEXT: DBG_VALUE $sgpr0
; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc_lo
renamable $sgpr0 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec, implicit-def $vcc_lo, implicit-def $vcc
S_WAITCNT_DEPCTR 65530
DBG_VALUE $sgpr0
renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr0, killed $sgpr0, implicit $exec, implicit-def $vcc
...