Skip to content

[AMDGPU] Add a trap lowering workaround for gfx11 #85854

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,12 @@ def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
"MSAA loads not honoring dst_sel bug"
>;

// Subtarget feature "priv-enabled-trap2-nop-bug": when enabled, sets the
// subtarget field HasPrivEnabledTrap2NopBug to "true". It marks hardware that
// runs with PRIV=1 and therefore interprets 's_trap 2' as a nop.
def FeaturePrivEnabledTrap2NopBug : SubtargetFeature<"priv-enabled-trap2-nop-bug",
"HasPrivEnabledTrap2NopBug",
"true",
"Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug"
>;

class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
Expand Down Expand Up @@ -1483,13 +1489,15 @@ def FeatureISAVersion11_Generic: FeatureSet<
[FeatureMSAALoadDstSelBug,
FeatureVALUTransUseHazard,
FeatureUserSGPRInit16Bug,
FeaturePrivEnabledTrap2NopBug,
FeatureRequiresCOV6])>;

def FeatureISAVersion11_0_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureMSAALoadDstSelBug,
FeatureVALUTransUseHazard,
FeatureMADIntraFwdBug])>;
FeatureMADIntraFwdBug,
FeaturePrivEnabledTrap2NopBug])>;

def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_0_Common.Features,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5376,6 +5376,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
NODE_NAME_CASE(ENDPGM_TRAP)
NODE_NAME_CASE(SIMULATED_TRAP)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,9 @@ enum NodeType : unsigned {
// s_endpgm, but we may want to insert it in the middle of the block.
ENDPGM_TRAP,

// "s_trap 2" equivalent on hardware that does not support it.
SIMULATED_TRAP,

// Return to a shader part's epilog code.
RETURN_TO_EPILOG,

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,8 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
// SelectionDAG node for AMDGPUISD::ENDPGM_TRAP. Uses the SDTNone type profile
// and carries only the SDNPHasChain property.
def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone,
[SDNPHasChain]>;
// SelectionDAG node for AMDGPUISD::SIMULATED_TRAP. Uses the SDTNone type
// profile and carries only the SDNPHasChain property.
def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone,
[SDNPHasChain]>;

// SelectionDAG node for AMDGPUISD::RETURN_TO_EPILOG. Uses the SDTNone type
// profile with the SDNPHasChain, SDNPOptInGlue and SDNPVariadic properties.
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
Expand Down
14 changes: 12 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6724,8 +6724,18 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
// We need to simulate the 's_trap 2' instruction on targets that run in
// PRIV=1 (where it is treated as a nop).
if (ST.hasPrivEnabledTrap2NopBug()) {
ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
MI.getDebugLoc());
MI.eraseFromParent();
return true;
}

B.buildInstr(AMDGPU::S_TRAP)
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
MI.eraseFromParent();
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasMSAALoadDstSelBug = false;
bool HasPrivEnabledTrap2NopBug = false;
bool Has1_5xVGPRs = false;
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
Expand Down Expand Up @@ -1026,6 +1027,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

/// \returns the value of the HasMSAALoadDstSelBug subtarget flag.
bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }

/// \returns the value of the HasPrivEnabledTrap2NopBug subtarget flag, i.e.
/// whether 's_trap 2' has to be simulated because it may be treated as a nop
/// when running with PRIV=1.
bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }

/// \returns the value of the HasNSAEncoding subtarget flag.
bool hasNSAEncoding() const { return HasNSAEncoding; }

/// \returns true when the subtarget generation is earlier than GFX12.
bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5409,6 +5409,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return SplitBB;
}
case AMDGPU::SIMULATED_TRAP: {
assert(Subtarget->hasPrivEnabledTrap2NopBug());
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
MachineBasicBlock *SplitBB =
TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
MI.eraseFromParent();
return SplitBB;
}
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
Expand Down Expand Up @@ -6627,6 +6635,11 @@ SDValue SITargetLowering::lowerTrapHsa(
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);

// We need to simulate the 's_trap 2' instruction on targets that run in
// PRIV=1 (where it is treated as a nop).
if (Subtarget->hasPrivEnabledTrap2NopBug())
return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);

uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
SDValue Ops[] = {
Chain,
Expand Down
51 changes: 51 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2026,6 +2026,57 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
}
}

/// Expand a trap located at \p MI into a software sequence for subtargets on
/// which `s_trap 2` may execute as a nop (PRIV=1).
///
/// The following is inserted in front of \p MI inside \p MBB, in this order:
///   1. `s_trap 2` itself (a nop when the workaround is actually needed).
///   2. S_SENDMSG_RTN_B32 with ID_RTN_GET_DOORBELL, result in a new SGPR.
///   3. A copy of M0 into TTMP2 so M0 can be restored afterwards.
///   4. The doorbell value AND'ed with 0x3ff and OR'ed with 0x400, moved into
///      M0, followed by S_SENDMSG with ID_INTERRUPT.
///   5. M0 restored from TTMP2 and an unconditional branch to a freshly
///      created block that loops on `S_SETHALT 5` forever.
///
/// \p MBB is split at \p MI first; the block holding the remaining
/// instructions is dropped from \p MBB's successors and the halt loop becomes
/// the successor instead. \p MI itself is not erased here.
///
/// \returns the block produced by MachineBasicBlock::splitAt, which may be
/// \p MBB itself.
MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
                                                    MachineBasicBlock &MBB,
                                                    MachineInstr &MI,
                                                    const DebugLoc &DL) const {
  constexpr unsigned DoorbellIDMask = 0x3ff;
  constexpr unsigned ECQueueWaveAbort = 0x400;

  // Carve off everything after the trap, then append the halt-loop block to
  // the end of the function.
  MachineFunction *ParentMF = MBB.getParent();
  MachineBasicBlock *TailBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
  MachineBasicBlock *HaltBB = ParentMF->CreateMachineBasicBlock();
  ParentMF->push_back(HaltBB);

  // Small emitters so the instruction sequence below reads top to bottom.
  const auto NewSGPR32 = [&MRI]() -> Register {
    return MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  };
  const auto Emit = [&](unsigned Opcode) {
    return BuildMI(MBB, MI, DL, get(Opcode));
  };
  const auto EmitDef = [&](unsigned Opcode, Register Dst) {
    return BuildMI(MBB, MI, DL, get(Opcode), Dst);
  };

  // The real trap goes first; where the workaround matters (PRIV=1) it simply
  // behaves as a nop and execution falls into the simulated sequence.
  Emit(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));

  const Register Doorbell = NewSGPR32();
  EmitDef(AMDGPU::S_SENDMSG_RTN_B32, Doorbell)
      .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);

  // Stash M0 in TTMP2 before it is overwritten with the message payload.
  EmitDef(AMDGPU::S_MOV_B32, AMDGPU::TTMP2).addUse(AMDGPU::M0);

  const Register MaskedDoorbell = NewSGPR32();
  EmitDef(AMDGPU::S_AND_B32, MaskedDoorbell)
      .addUse(Doorbell)
      .addImm(DoorbellIDMask);

  const Register AbortPayload = NewSGPR32();
  EmitDef(AMDGPU::S_OR_B32, AbortPayload)
      .addUse(MaskedDoorbell)
      .addImm(ECQueueWaveAbort);

  EmitDef(AMDGPU::S_MOV_B32, AMDGPU::M0).addUse(AbortPayload);
  Emit(AMDGPU::S_SENDMSG).addImm(AMDGPU::SendMsg::ID_INTERRUPT);

  // Put the original M0 back, then jump into the halt loop.
  EmitDef(AMDGPU::S_MOV_B32, AMDGPU::M0).addUse(AMDGPU::TTMP2);
  Emit(AMDGPU::S_BRANCH).addMBB(HaltBB);

  // Halt loop body: S_SETHALT followed by a branch back to itself.
  BuildMI(*HaltBB, HaltBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
  BuildMI(*HaltBB, HaltBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltBB);

  // Rewire the CFG: MBB now only reaches the halt loop, which loops on itself.
  const bool DidSplit = TailBB != &MBB;
  if (DidSplit)
    MBB.removeSuccessor(TailBB);
  MBB.addSuccessor(HaltBB);
  HaltBB->addSuccessor(HaltBB);

  return TailBB;
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned Quantity) const override;

void insertReturn(MachineBasicBlock &MBB) const;

/// Build instructions that simulate the behavior of an `s_trap 2` instruction
/// for hardware (namely, gfx11) that runs in PRIV=1 mode. There, s_trap is
/// interpreted as a nop.
///
/// \param MRI used to create the virtual registers the sequence needs.
/// \param MBB block containing \p MI; it is split at \p MI.
/// \param MI  instruction the sequence is inserted in front of. It is not
///            erased by this function.
/// \param DL  debug location attached to the emitted instructions.
/// \returns the block resulting from splitting \p MBB at \p MI (this may be
///          \p MBB itself).
MachineBasicBlock *insertSimulatedTrap(MachineRegisterInfo &MRI,
MachineBasicBlock &MBB,
MachineInstr &MI,
const DebugLoc &DL) const;

/// Return the number of wait states that result from executing this
/// instruction.
static unsigned getNumWaitStates(const MachineInstr &MI);
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ def ENDPGM_TRAP : SPseudoInstSI<
let usesCustomInserter = 1;
}

// Pseudo instruction selected from the AMDGPUsimulated_trap node. It has no
// operands and is marked as having side effects. Because usesCustomInserter
// is set, it is expanded by the target's custom inserter rather than being
// emitted directly.
def SIMULATED_TRAP : SPseudoInstSI<(outs), (ins), [(AMDGPUsimulated_trap)],
"SIMULATED_TRAP"> {
let hasSideEffects = 1;
let usesCustomInserter = 1;
}

def ATOMIC_FENCE : SPseudoInstSI<
(outs), (ins i32imm:$ordering, i32imm:$scope),
[(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
Expand Down
49 changes: 49 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s

---
name: test_trap
body: |
bb.0:
; GFX1100-LABEL: name: test_trap
; GFX1100: successors: %bb.2(0x80000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1100-NEXT: S_TRAP 2
; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
; GFX1100-NEXT: S_BRANCH %bb.2
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: .1:
; GFX1100-NEXT: successors:
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: .2:
; GFX1100-NEXT: successors: %bb.2(0x80000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_SETHALT 5
; GFX1100-NEXT: S_BRANCH %bb.2
;
; GFX1150-LABEL: name: test_trap
; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1150-NEXT: S_TRAP 2
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
%0:_(s8) = G_CONSTANT i8 0
%1:_(p1) = G_CONSTANT i64 0
G_STORE %0, %1 :: (store 1, addrspace 1)
G_TRAP
G_STORE %0, %1 :: (store 1, addrspace 1)

...
Loading