Skip to content

[AMDGPU] Fix broken MIR generated by gfx11 simulated trap lowering #91652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 29 additions & 22 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2033,50 +2033,57 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
MachineInstr &MI,
const DebugLoc &DL) const {
MachineFunction *MF = MBB.getParent();
MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock();
MF->push_back(HaltLoop);

constexpr unsigned DoorbellIDMask = 0x3ff;
constexpr unsigned ECQueueWaveAbort = 0x400;

MachineBasicBlock *TrapBB = &MBB;
MachineBasicBlock *ContBB = &MBB;
MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();

if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
TrapBB = MF->CreateMachineBasicBlock();
BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
MF->push_back(TrapBB);
MBB.addSuccessor(TrapBB);
}

// Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
// will be a nop.
BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
DoorbellReg)
.addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
.addUse(AMDGPU::M0);
Register DoorbellRegMasked =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
.addUse(DoorbellReg)
.addImm(DoorbellIDMask);
Register SetWaveAbortBit =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
.addUse(DoorbellRegMasked)
.addImm(ECQueueWaveAbort);
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addUse(SetWaveAbortBit);
BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
.addImm(AMDGPU::SendMsg::ID_INTERRUPT);
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addUse(AMDGPU::TTMP2);
BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);

BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
.addMBB(HaltLoop);
BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
TrapBB->addSuccessor(HaltLoopBB);

if (SplitBB != &MBB)
MBB.removeSuccessor(SplitBB);
MBB.addSuccessor(HaltLoop);
HaltLoop->addSuccessor(HaltLoop);
BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
.addMBB(HaltLoopBB);
MF->push_back(HaltLoopBB);
HaltLoopBB->addSuccessor(HaltLoopBB);

return SplitBB;
return ContBB;
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
Expand Down
89 changes: 76 additions & 13 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1150 %s

---
name: test_trap
body: |
bb.0:
; GFX1100-LABEL: name: test_trap
; GFX1100: successors: %bb.2(0x80000000)
; GFX1100: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: .1:
; GFX1100-NEXT: successors:
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: .2:
; GFX1100-NEXT: successors: %bb.3(0x80000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_TRAP 2
; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
Expand All @@ -21,18 +31,13 @@ body: |
; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
; GFX1100-NEXT: S_BRANCH %bb.2
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: .1:
; GFX1100-NEXT: successors:
; GFX1100-NEXT: S_BRANCH %bb.3
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: .2:
; GFX1100-NEXT: successors: %bb.2(0x80000000)
; GFX1100-NEXT: .3:
; GFX1100-NEXT: successors: %bb.3(0x80000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_SETHALT 5
; GFX1100-NEXT: S_BRANCH %bb.2
; GFX1100-NEXT: S_BRANCH %bb.3
;
; GFX1150-LABEL: name: test_trap
; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
Expand All @@ -45,5 +50,63 @@ body: |
G_STORE %0, %1 :: (store 1, addrspace 1)
G_TRAP
G_STORE %0, %1 :: (store 1, addrspace 1)
...

# G_TRAP is the last instruction of bb.0, which has no terminator and lists
# bb.1 as its successor. For the gfx1100 run lines the checks expect an
# S_CBRANCH_EXECNZ to a separate trap block (bb.2) that holds the simulated
# trap sequence and branches to a self-looping S_SETHALT block (bb.3), while
# bb.1 keeps the trailing store. For the gfx1150 run line the checks expect a
# plain S_TRAP 2 in bb.0 with no extra blocks.
---
name: test_fallthrough_trap
body: |
; GFX1100-LABEL: name: test_fallthrough_trap
; GFX1100: bb.0:
; GFX1100-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: bb.1:
; GFX1100-NEXT: successors:
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: bb.2:
; GFX1100-NEXT: successors: %bb.3(0x80000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_TRAP 2
; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
; GFX1100-NEXT: S_BRANCH %bb.3
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: bb.3:
; GFX1100-NEXT: successors: %bb.3(0x80000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_SETHALT 5
; GFX1100-NEXT: S_BRANCH %bb.3
;
; GFX1150-LABEL: name: test_fallthrough_trap
; GFX1150: bb.0:
; GFX1150-NEXT: successors: %bb.1(0x80000000)
; GFX1150-NEXT: {{ $}}
; GFX1150-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
; GFX1150-NEXT: S_TRAP 2
; GFX1150-NEXT: {{ $}}
; GFX1150-NEXT: bb.1:
; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
bb.0:
successors: %bb.1

%0:_(s8) = G_CONSTANT i8 0
%1:_(p1) = G_CONSTANT i64 0
G_STORE %0, %1 :: (store 1, addrspace 1)
G_TRAP

bb.1:
G_STORE %0, %1 :: (store 1, addrspace 1)
...
150 changes: 150 additions & 0 deletions llvm/test/CodeGen/AMDGPU/trap-abis.ll
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,142 @@ ret:
ret void
}

; A value is loaded (volatile) before the call to llvm.trap and stored
; (volatile) after it, so it is live across the trap. In every configuration
; checked below the store is still emitted after the trap point. In the
; gfx1100 HSA configurations the trap sequence sits in its own block
; (.LBB2_2), reached via s_cbranch_execnz and followed by an s_sethalt loop
; (.LBB2_3).
define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after:
; NOHSA-TRAP-GFX900: ; %bb.0:
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
; NOHSA-TRAP-GFX900-NEXT: ; %bb.1:
; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: .LBB2_2:
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
;
; HSA-TRAP-GFX803-LABEL: trap_with_use_after:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5]
; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5
; HSA-TRAP-GFX803-NEXT: flat_load_dword v2, v[0:1] glc
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s6
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s7
; HSA-TRAP-GFX803-NEXT: s_trap 2
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-NEXT: s_endpgm
;
; HSA-TRAP-GFX900-LABEL: trap_with_use_after:
; HSA-TRAP-GFX900: ; %bb.0:
; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX900-NEXT: s_trap 2
; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX900-NEXT: s_endpgm
;
; HSA-NOTRAP-GFX900-LABEL: trap_with_use_after:
; HSA-NOTRAP-GFX900: ; %bb.0:
; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; HSA-NOTRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
; HSA-NOTRAP-GFX900-NEXT: ; %bb.1:
; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; HSA-NOTRAP-GFX900-NEXT: .LBB2_2:
; HSA-NOTRAP-GFX900-NEXT: s_endpgm
;
; HSA-TRAP-GFX1100-LABEL: trap_with_use_after:
; HSA-TRAP-GFX1100: ; %bb.0:
; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; HSA-TRAP-GFX1100-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX1100-NEXT: s_cbranch_execnz .LBB2_2
; HSA-TRAP-GFX1100-NEXT: ; %bb.1:
; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; HSA-TRAP-GFX1100-NEXT: s_nop 0
; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; HSA-TRAP-GFX1100-NEXT: s_endpgm
; HSA-TRAP-GFX1100-NEXT: .LBB2_2:
; HSA-TRAP-GFX1100-NEXT: s_trap 2
; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0
; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff
; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10
; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0
; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2
; HSA-TRAP-GFX1100-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
; HSA-TRAP-GFX1100-NEXT: s_sethalt 5
; HSA-TRAP-GFX1100-NEXT: s_branch .LBB2_3
;
; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after:
; HSA-TRAP-GFX1100-O0: ; %bb.0:
; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:8 ; 4-byte Folded Spill
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s2, 0
; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s3, 1
; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1
; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v1, off offset:4 ; 4-byte Folded Spill
; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6
; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill
; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_execnz .LBB2_2
; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1:
; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1
; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload
; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0
; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1
; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:8 ; 4-byte Folded Reload
; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0
; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm
; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2:
; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2
; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff
; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400
; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0
; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2
; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5
; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB2_3
%tmp = load volatile i32, ptr addrspace(1) %arg0
call void @llvm.trap()
store volatile i32 %tmp, ptr addrspace(1) %arg1
ret void
}

define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: debugtrap:
; NOHSA-TRAP-GFX900: ; %bb.0:
Expand Down Expand Up @@ -334,6 +470,20 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
; HSA-TRAP-GFX1100-NEXT: s_nop 0
; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; HSA-TRAP-GFX1100-NEXT: s_endpgm
;
; HSA-TRAP-GFX1100-O0-LABEL: debugtrap:
; HSA-TRAP-GFX1100-O0: ; %bb.0:
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
; HSA-TRAP-GFX1100-O0-NEXT: s_trap 3
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 2
; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.debugtrap()
store volatile i32 2, ptr addrspace(1) %arg0
Expand Down