Skip to content

Commit 4c67acc

Browse files
committed
[AMDGPU] Add a trap lowering workaround for gfx11 (llvm#85854)
On gfx11 shaders run with PRIV=1, which causes `s_trap 2` to be treated as a nop, which means it isn't a correct lowering for the trap intrinsic. As a workaround, this commit instead lowers the trap intrinsic to instructions that simulate the behavior of s_trap 2. This commit also includes the following commit, fixing a bug in the original version: > 9e0be65 > [AMDGPU] Fix broken MIR generated by gfx11 simulated trap lowering (llvm#91652) > This was breaking the CFG connection between uses of virtual registers > after the trap and their definitions before it. Fixes SWDEV-460384. > Fixes a bug in llvm#85854. Fixes: SWDEV-438421 Change-Id: I58244d69370abe3c57f5239f196cb284a5407530
1 parent 42fb59b commit 4c67acc

File tree

12 files changed

+512
-2
lines changed

12 files changed

+512
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,12 @@ def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
306306
"MSAA loads not honoring dst_sel bug"
307307
>;
308308

309+
def FeaturePrivEnabledTrap2NopBug : SubtargetFeature<"priv-enabled-trap2-nop-bug",
310+
"HasPrivEnabledTrap2NopBug",
311+
"true",
312+
"Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug"
313+
>;
314+
309315
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
310316
"ldsbankcount"#Value,
311317
"LDSBankCount",
@@ -1473,6 +1479,7 @@ def FeatureISAVersion11_Common : FeatureSet<
14731479
def FeatureISAVersion11_0_Common : FeatureSet<
14741480
!listconcat(FeatureISAVersion11_Common.Features,
14751481
[FeatureMSAALoadDstSelBug,
1482+
FeaturePrivEnabledTrap2NopBug,
14761483
FeatureVALUTransUseHazard])>;
14771484

14781485
def FeatureISAVersion11_0_0 : FeatureSet<

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5360,6 +5360,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
53605360
NODE_NAME_CASE(RETURN_TO_EPILOG)
53615361
NODE_NAME_CASE(ENDPGM)
53625362
NODE_NAME_CASE(ENDPGM_TRAP)
5363+
NODE_NAME_CASE(SIMULATED_TRAP)
53635364
NODE_NAME_CASE(DWORDADDR)
53645365
NODE_NAME_CASE(FRACT)
53655366
NODE_NAME_CASE(SETCC)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,9 @@ enum NodeType : unsigned {
405405
// s_endpgm, but we may want to insert it in the middle of the block.
406406
ENDPGM_TRAP,
407407

408+
// "s_trap 2" equivalent on hardware that does not support it.
409+
SIMULATED_TRAP,
410+
408411
// Return to a shader part's epilog code.
409412
RETURN_TO_EPILOG,
410413

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,8 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
377377
[SDNPHasChain, SDNPOptInGlue]>;
378378
def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone,
379379
[SDNPHasChain]>;
380+
def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone,
381+
[SDNPHasChain]>;
380382

381383
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
382384
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6670,8 +6670,18 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
66706670
return true;
66716671
}
66726672

6673-
bool AMDGPULegalizerInfo::legalizeTrapHsa(
6674-
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6673+
bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6674+
MachineRegisterInfo &MRI,
6675+
MachineIRBuilder &B) const {
6676+
// We need to simulate the 's_trap 2' instruction on targets that run in
6677+
// PRIV=1 (where it is treated as a nop).
6678+
if (ST.hasPrivEnabledTrap2NopBug()) {
6679+
ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6680+
MI.getDebugLoc());
6681+
MI.eraseFromParent();
6682+
return true;
6683+
}
6684+
66756685
B.buildInstr(AMDGPU::S_TRAP)
66766686
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
66776687
MI.eraseFromParent();

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
224224
bool HasImageGather4D16Bug = false;
225225
bool HasMSAALoadDstSelBug = false;
226226
bool HasGFX11FullVGPRs = false;
227+
bool HasPrivEnabledTrap2NopBug = false;
227228
bool HasMADIntraFwdBug = false;
228229
bool HasVOPDInsts = false;
229230
bool HasVALUTransUseHazard = false;
@@ -1010,6 +1011,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
10101011

10111012
bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
10121013

1014+
bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1015+
10131016
bool hasNSAEncoding() const { return HasNSAEncoding; }
10141017

10151018
bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5291,6 +5291,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
52915291
MI.eraseFromParent();
52925292
return SplitBB;
52935293
}
5294+
case AMDGPU::SIMULATED_TRAP: {
5295+
assert(Subtarget->hasPrivEnabledTrap2NopBug());
5296+
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5297+
MachineBasicBlock *SplitBB =
5298+
TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5299+
MI.eraseFromParent();
5300+
return SplitBB;
5301+
}
52945302
default:
52955303
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
52965304
}
@@ -6513,6 +6521,11 @@ SDValue SITargetLowering::lowerTrapHsa(
65136521
SDLoc SL(Op);
65146522
SDValue Chain = Op.getOperand(0);
65156523

6524+
// We need to simulate the 's_trap 2' instruction on targets that run in
6525+
// PRIV=1 (where it is treated as a nop).
6526+
if (Subtarget->hasPrivEnabledTrap2NopBug())
6527+
return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6528+
65166529
uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
65176530
SDValue Ops[] = {
65186531
Chain,

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2077,6 +2077,64 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
20772077
}
20782078
}
20792079

2080+
MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2081+
MachineBasicBlock &MBB,
2082+
MachineInstr &MI,
2083+
const DebugLoc &DL) const {
2084+
MachineFunction *MF = MBB.getParent();
2085+
constexpr unsigned DoorbellIDMask = 0x3ff;
2086+
constexpr unsigned ECQueueWaveAbort = 0x400;
2087+
2088+
MachineBasicBlock *TrapBB = &MBB;
2089+
MachineBasicBlock *ContBB = &MBB;
2090+
MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2091+
2092+
if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2093+
ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2094+
TrapBB = MF->CreateMachineBasicBlock();
2095+
BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2096+
MF->push_back(TrapBB);
2097+
MBB.addSuccessor(TrapBB);
2098+
}
2099+
2100+
// Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
2101+
// will be a nop.
2102+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2103+
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2104+
Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2105+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2106+
DoorbellReg)
2107+
.addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2108+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2109+
.addUse(AMDGPU::M0);
2110+
Register DoorbellRegMasked =
2111+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2112+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2113+
.addUse(DoorbellReg)
2114+
.addImm(DoorbellIDMask);
2115+
Register SetWaveAbortBit =
2116+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2117+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2118+
.addUse(DoorbellRegMasked)
2119+
.addImm(ECQueueWaveAbort);
2120+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2121+
.addUse(SetWaveAbortBit);
2122+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2123+
.addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2124+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2125+
.addUse(AMDGPU::TTMP2);
2126+
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2127+
TrapBB->addSuccessor(HaltLoopBB);
2128+
2129+
BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2130+
BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2131+
.addMBB(HaltLoopBB);
2132+
MF->push_back(HaltLoopBB);
2133+
HaltLoopBB->addSuccessor(HaltLoopBB);
2134+
2135+
return ContBB;
2136+
}
2137+
20802138
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
20812139
switch (MI.getOpcode()) {
20822140
default:

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,6 +1192,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
11921192
unsigned Quantity) const override;
11931193

11941194
void insertReturn(MachineBasicBlock &MBB) const;
1195+
1196+
/// Build instructions that simulate the behavior of a `s_trap 2` instructions
1197+
/// for hardware (namely, gfx11) that runs in PRIV=1 mode. There, s_trap is
1198+
/// interpreted as a nop.
1199+
MachineBasicBlock *insertSimulatedTrap(MachineRegisterInfo &MRI,
1200+
MachineBasicBlock &MBB,
1201+
MachineInstr &MI,
1202+
const DebugLoc &DL) const;
1203+
11951204
/// Return the number of wait states that result from executing this
11961205
/// instruction.
11971206
static unsigned getNumWaitStates(const MachineInstr &MI);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,12 @@ def ENDPGM_TRAP : SPseudoInstSI<
106106
let usesCustomInserter = 1;
107107
}
108108

109+
def SIMULATED_TRAP : SPseudoInstSI<(outs), (ins), [(AMDGPUsimulated_trap)],
110+
"SIMULATED_TRAP"> {
111+
let hasSideEffects = 1;
112+
let usesCustomInserter = 1;
113+
}
114+
109115
def ATOMIC_FENCE : SPseudoInstSI<
110116
(outs), (ins i32imm:$ordering, i32imm:$scope),
111117
[(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
2+
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
3+
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1150 %s
4+
5+
---
6+
name: test_trap
7+
body: |
8+
bb.0:
9+
; GFX1100-LABEL: name: test_trap
10+
; GFX1100: successors: %bb.1(0x40000000), %bb.2(0x40000000)
11+
; GFX1100-NEXT: {{ $}}
12+
; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
13+
; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
14+
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
15+
; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
16+
; GFX1100-NEXT: {{ $}}
17+
; GFX1100-NEXT: .1:
18+
; GFX1100-NEXT: successors:{{ $}}
19+
; GFX1100-NEXT: {{ $}}
20+
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
21+
; GFX1100-NEXT: {{ $}}
22+
; GFX1100-NEXT: .2:
23+
; GFX1100-NEXT: successors: %bb.3(0x80000000)
24+
; GFX1100-NEXT: {{ $}}
25+
; GFX1100-NEXT: S_TRAP 2
26+
; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
27+
; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
28+
; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
29+
; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
30+
; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
31+
; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
32+
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
33+
; GFX1100-NEXT: S_BRANCH %bb.3
34+
; GFX1100-NEXT: {{ $}}
35+
; GFX1100-NEXT: .3:
36+
; GFX1100-NEXT: successors: %bb.3(0x80000000)
37+
; GFX1100-NEXT: {{ $}}
38+
; GFX1100-NEXT: S_SETHALT 5
39+
; GFX1100-NEXT: S_BRANCH %bb.3
40+
;
41+
; GFX1150-LABEL: name: test_trap
42+
; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
43+
; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
44+
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
45+
; GFX1150-NEXT: S_TRAP 2
46+
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
47+
%0:_(s8) = G_CONSTANT i8 0
48+
%1:_(p1) = G_CONSTANT i64 0
49+
G_STORE %0, %1 :: (store 1, addrspace 1)
50+
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap)
51+
G_STORE %0, %1 :: (store 1, addrspace 1)
52+
...
53+
54+
---
55+
name: test_fallthrough_trap
56+
body: |
57+
; GFX1100-LABEL: name: test_fallthrough_trap
58+
; GFX1100: bb.0:
59+
; GFX1100-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000)
60+
; GFX1100-NEXT: {{ $}}
61+
; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
62+
; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
63+
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
64+
; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
65+
; GFX1100-NEXT: {{ $}}
66+
; GFX1100-NEXT: bb.1:
67+
; GFX1100-NEXT: successors:{{ $}}
68+
; GFX1100-NEXT: {{ $}}
69+
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
70+
; GFX1100-NEXT: {{ $}}
71+
; GFX1100-NEXT: bb.2:
72+
; GFX1100-NEXT: successors: %bb.3(0x80000000)
73+
; GFX1100-NEXT: {{ $}}
74+
; GFX1100-NEXT: S_TRAP 2
75+
; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
76+
; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
77+
; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
78+
; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
79+
; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
80+
; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
81+
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
82+
; GFX1100-NEXT: S_BRANCH %bb.3
83+
; GFX1100-NEXT: {{ $}}
84+
; GFX1100-NEXT: bb.3:
85+
; GFX1100-NEXT: successors: %bb.3(0x80000000)
86+
; GFX1100-NEXT: {{ $}}
87+
; GFX1100-NEXT: S_SETHALT 5
88+
; GFX1100-NEXT: S_BRANCH %bb.3
89+
;
90+
; GFX1150-LABEL: name: test_fallthrough_trap
91+
; GFX1150: bb.0:
92+
; GFX1150-NEXT: successors: %bb.1(0x80000000)
93+
; GFX1150-NEXT: {{ $}}
94+
; GFX1150-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
95+
; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
96+
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
97+
; GFX1150-NEXT: S_TRAP 2
98+
; GFX1150-NEXT: {{ $}}
99+
; GFX1150-NEXT: bb.1:
100+
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
101+
bb.0:
102+
successors: %bb.1
103+
104+
%0:_(s8) = G_CONSTANT i8 0
105+
%1:_(p1) = G_CONSTANT i64 0
106+
G_STORE %0, %1 :: (store 1, addrspace 1)
107+
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap)
108+
109+
bb.1:
110+
G_STORE %0, %1 :: (store 1, addrspace 1)
111+
...

0 commit comments

Comments
 (0)