Skip to content

Commit a047147

Browse files
authored
[AMDGPU] Add a trap lowering workaround for gfx11 (#85854)
On gfx11, shaders run with PRIV=1, which causes `s_trap 2` to be treated as a nop, so it is not a correct lowering for the trap intrinsic. As a workaround, this commit instead lowers the trap intrinsic to a sequence of instructions that simulates the behavior of `s_trap 2`. Fixes: SWDEV-438421
1 parent e58dcf1 commit a047147

File tree

12 files changed

+295
-3
lines changed

12 files changed

+295
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,12 @@ def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
307307
"MSAA loads not honoring dst_sel bug"
308308
>;
309309

310+
def FeaturePrivEnabledTrap2NopBug : SubtargetFeature<"priv-enabled-trap2-nop-bug",
311+
"HasPrivEnabledTrap2NopBug",
312+
"true",
313+
"Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug"
314+
>;
315+
310316
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
311317
"ldsbankcount"#Value,
312318
"LDSBankCount",
@@ -1487,13 +1493,15 @@ def FeatureISAVersion11_Generic: FeatureSet<
14871493
[FeatureMSAALoadDstSelBug,
14881494
FeatureVALUTransUseHazard,
14891495
FeatureUserSGPRInit16Bug,
1496+
FeaturePrivEnabledTrap2NopBug,
14901497
FeatureRequiresCOV6])>;
14911498

14921499
def FeatureISAVersion11_0_Common : FeatureSet<
14931500
!listconcat(FeatureISAVersion11_Common.Features,
14941501
[FeatureMSAALoadDstSelBug,
14951502
FeatureVALUTransUseHazard,
1496-
FeatureMADIntraFwdBug])>;
1503+
FeatureMADIntraFwdBug,
1504+
FeaturePrivEnabledTrap2NopBug])>;
14971505

14981506
def FeatureISAVersion11_0_0 : FeatureSet<
14991507
!listconcat(FeatureISAVersion11_0_Common.Features,

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5377,6 +5377,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
53775377
NODE_NAME_CASE(RETURN_TO_EPILOG)
53785378
NODE_NAME_CASE(ENDPGM)
53795379
NODE_NAME_CASE(ENDPGM_TRAP)
5380+
NODE_NAME_CASE(SIMULATED_TRAP)
53805381
NODE_NAME_CASE(DWORDADDR)
53815382
NODE_NAME_CASE(FRACT)
53825383
NODE_NAME_CASE(SETCC)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,9 @@ enum NodeType : unsigned {
407407
// s_endpgm, but we may want to insert it in the middle of the block.
408408
ENDPGM_TRAP,
409409

410+
// "s_trap 2" equivalent on hardware that does not support it.
411+
SIMULATED_TRAP,
412+
410413
// Return to a shader part's epilog code.
411414
RETURN_TO_EPILOG,
412415

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,8 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
377377
[SDNPHasChain, SDNPOptInGlue]>;
378378
def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone,
379379
[SDNPHasChain]>;
380+
def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone,
381+
[SDNPHasChain]>;
380382

381383
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
382384
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6725,8 +6725,18 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
67256725
return true;
67266726
}
67276727

6728-
bool AMDGPULegalizerInfo::legalizeTrapHsa(
6729-
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6728+
bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6729+
MachineRegisterInfo &MRI,
6730+
MachineIRBuilder &B) const {
6731+
// We need to simulate the 's_trap 2' instruction on targets that run in
6732+
// PRIV=1 (where it is treated as a nop).
6733+
if (ST.hasPrivEnabledTrap2NopBug()) {
6734+
ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6735+
MI.getDebugLoc());
6736+
MI.eraseFromParent();
6737+
return true;
6738+
}
6739+
67306740
B.buildInstr(AMDGPU::S_TRAP)
67316741
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
67326742
MI.eraseFromParent();

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
224224
bool HasImageStoreD16Bug = false;
225225
bool HasImageGather4D16Bug = false;
226226
bool HasMSAALoadDstSelBug = false;
227+
bool HasPrivEnabledTrap2NopBug = false;
227228
bool Has1_5xVGPRs = false;
228229
bool HasMADIntraFwdBug = false;
229230
bool HasVOPDInsts = false;
@@ -1032,6 +1033,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
10321033

10331034
bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
10341035

1036+
bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1037+
10351038
bool hasNSAEncoding() const { return HasNSAEncoding; }
10361039

10371040
bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5405,6 +5405,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
54055405
MI.eraseFromParent();
54065406
return SplitBB;
54075407
}
5408+
case AMDGPU::SIMULATED_TRAP: {
5409+
assert(Subtarget->hasPrivEnabledTrap2NopBug());
5410+
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5411+
MachineBasicBlock *SplitBB =
5412+
TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5413+
MI.eraseFromParent();
5414+
return SplitBB;
5415+
}
54085416
default:
54095417
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
54105418
if (!MI.mayStore())
@@ -6623,6 +6631,11 @@ SDValue SITargetLowering::lowerTrapHsa(
66236631
SDLoc SL(Op);
66246632
SDValue Chain = Op.getOperand(0);
66256633

6634+
// We need to simulate the 's_trap 2' instruction on targets that run in
6635+
// PRIV=1 (where it is treated as a nop).
6636+
if (Subtarget->hasPrivEnabledTrap2NopBug())
6637+
return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6638+
66266639
uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
66276640
SDValue Ops[] = {
66286641
Chain,

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2026,6 +2026,57 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
20262026
}
20272027
}
20282028

2029+
MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2030+
MachineBasicBlock &MBB,
2031+
MachineInstr &MI,
2032+
const DebugLoc &DL) const {
2033+
MachineFunction *MF = MBB.getParent();
2034+
MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2035+
MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock();
2036+
MF->push_back(HaltLoop);
2037+
2038+
constexpr unsigned DoorbellIDMask = 0x3ff;
2039+
constexpr unsigned ECQueueWaveAbort = 0x400;
2040+
2041+
// Start with a `s_trap 2`; if we're in PRIV=1 and we need the workaround, this
2042+
// will be a nop.
2043+
BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
2044+
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2045+
Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2046+
BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
2047+
.addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2048+
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2049+
.addUse(AMDGPU::M0);
2050+
Register DoorbellRegMasked =
2051+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2052+
BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2053+
.addUse(DoorbellReg)
2054+
.addImm(DoorbellIDMask);
2055+
Register SetWaveAbortBit =
2056+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2057+
BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2058+
.addUse(DoorbellRegMasked)
2059+
.addImm(ECQueueWaveAbort);
2060+
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2061+
.addUse(SetWaveAbortBit);
2062+
BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
2063+
.addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2064+
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2065+
.addUse(AMDGPU::TTMP2);
2066+
BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);
2067+
2068+
BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2069+
BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
2070+
.addMBB(HaltLoop);
2071+
2072+
if (SplitBB != &MBB)
2073+
MBB.removeSuccessor(SplitBB);
2074+
MBB.addSuccessor(HaltLoop);
2075+
HaltLoop->addSuccessor(HaltLoop);
2076+
2077+
return SplitBB;
2078+
}
2079+
20292080
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
20302081
switch (MI.getOpcode()) {
20312082
default:

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
11941194
unsigned Quantity) const override;
11951195

11961196
void insertReturn(MachineBasicBlock &MBB) const;
1197+
1198+
/// Build instructions that simulate the behavior of an `s_trap 2` instruction
1199+
/// for hardware (namely, gfx11) that runs in PRIV=1 mode. There, s_trap is
1200+
/// interpreted as a nop.
1201+
MachineBasicBlock *insertSimulatedTrap(MachineRegisterInfo &MRI,
1202+
MachineBasicBlock &MBB,
1203+
MachineInstr &MI,
1204+
const DebugLoc &DL) const;
1205+
11971206
/// Return the number of wait states that result from executing this
11981207
/// instruction.
11991208
static unsigned getNumWaitStates(const MachineInstr &MI);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,12 @@ def ENDPGM_TRAP : SPseudoInstSI<
106106
let usesCustomInserter = 1;
107107
}
108108

109+
def SIMULATED_TRAP : SPseudoInstSI<(outs), (ins), [(AMDGPUsimulated_trap)],
110+
"SIMULATED_TRAP"> {
111+
let hasSideEffects = 1;
112+
let usesCustomInserter = 1;
113+
}
114+
109115
def ATOMIC_FENCE : SPseudoInstSI<
110116
(outs), (ins i32imm:$ordering, i32imm:$scope),
111117
[(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
2+
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
3+
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
4+
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s
5+
6+
---
7+
name: test_trap
8+
body: |
9+
bb.0:
10+
; GFX1100-LABEL: name: test_trap
11+
; GFX1100: successors: %bb.2(0x80000000)
12+
; GFX1100-NEXT: {{ $}}
13+
; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
14+
; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
15+
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
16+
; GFX1100-NEXT: S_TRAP 2
17+
; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
18+
; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
19+
; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
20+
; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
21+
; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
22+
; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
23+
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
24+
; GFX1100-NEXT: S_BRANCH %bb.2
25+
; GFX1100-NEXT: {{ $}}
26+
; GFX1100-NEXT: .1:
27+
; GFX1100-NEXT: successors:
28+
; GFX1100-NEXT: {{ $}}
29+
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
30+
; GFX1100-NEXT: {{ $}}
31+
; GFX1100-NEXT: .2:
32+
; GFX1100-NEXT: successors: %bb.2(0x80000000)
33+
; GFX1100-NEXT: {{ $}}
34+
; GFX1100-NEXT: S_SETHALT 5
35+
; GFX1100-NEXT: S_BRANCH %bb.2
36+
;
37+
; GFX1150-LABEL: name: test_trap
38+
; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
39+
; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
40+
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
41+
; GFX1150-NEXT: S_TRAP 2
42+
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
43+
%0:_(s8) = G_CONSTANT i8 0
44+
%1:_(p1) = G_CONSTANT i64 0
45+
G_STORE %0, %1 :: (store 1, addrspace 1)
46+
G_TRAP
47+
G_STORE %0, %1 :: (store 1, addrspace 1)
48+
49+
...

0 commit comments

Comments
 (0)