Skip to content

[AMDGPU] Add intrinsics and MIs for ds_bvh_stack_* #130007

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2714,18 +2714,21 @@ def int_amdgcn_ds_sub_gs_reg_rtn :
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;

def int_amdgcn_ds_bvh_stack_rtn :
class IntDSBVHStackRtn<LLVMType vdst, LLVMType data1> :
Intrinsic<
[llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
[vdst, llvm_i32_ty], // %vdst, %addr
[
llvm_i32_ty, // %addr
llvm_i32_ty, // %data0
llvm_v4i32_ty, // %data1
data1, // %data1
llvm_i32_ty, // %offset
],
[ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
data1 = llvm_v4i32_ty>;

def int_amdgcn_s_wait_event_export_ready :
ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
Expand Down Expand Up @@ -2801,6 +2804,15 @@ def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty
// GFX12 Intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_ds_bvh_stack_push4_pop1_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
data1 = llvm_v4i32_ty>;

def int_amdgcn_ds_bvh_stack_push8_pop1_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
data1 = llvm_v8i32_ty>;

def int_amdgcn_ds_bvh_stack_push8_pop2_rtn : IntDSBVHStackRtn<vdst = llvm_i64_ty,
data1 = llvm_v8i32_ty>;

// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
Intrinsic<[llvm_i32_ty],
Expand Down
21 changes: 18 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2665,8 +2665,20 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
unsigned Opc;
switch (IntrID) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
break;
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
break;
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
break;
}
SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
N->getOperand(5), N->getOperand(0)};

Expand Down Expand Up @@ -2830,7 +2842,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
return;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
SelectDSBvhStackIntrinsic(N, IntrID);
return;
case Intrinsic::amdgcn_init_whole_wave:
CurDAG->getMachineFunction()
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectFMAD_FMA(SDNode *N);
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N);
void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
Expand Down
19 changes: 18 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2277,7 +2277,21 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
Register Data1 = MI.getOperand(5).getReg();
unsigned Offset = MI.getOperand(6).getImm();

auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
unsigned Opc;
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
break;
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
break;
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
break;
}

auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
.addDef(Dst1)
.addUse(Addr)
.addUse(Data0)
Expand Down Expand Up @@ -2332,6 +2346,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
}
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
return selectDSBvhStackIntrinsic(I);
case Intrinsic::amdgcn_s_barrier_signal_var:
return selectNamedBarrierInit(I, IntrinsicID);
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5252,7 +5252,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
OpdsMapping[0] =
getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
OpdsMapping[1] =
Expand Down
24 changes: 20 additions & 4 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -258,10 +258,13 @@ multiclass DS_1A2D_Off8_RET_mc<string opName,
}
}

class DS_BVH_STACK<string opName>
class DS_BVH_STACK<string opName,
RegisterClass vdst_rc,
RegisterClass data1_rc>
: DS_Pseudo<opName,
(outs getLdStRegisterOperand<VGPR_32>.ret:$vdst, VGPR_32:$addr),
(ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0, VReg_128:$data1, Offset:$offset),
(outs getLdStRegisterOperand<vdst_rc>.ret:$vdst, VGPR_32:$addr),
(ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0,
data1_rc:$data1, Offset:$offset),
" $vdst, $addr, $data0, $data1$offset"> {
let Constraints = "$addr = $addr_in";
let DisableEncoding = "$addr_in";
Expand Down Expand Up @@ -722,7 +725,8 @@ def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
let SubtargetPredicate = isGFX11Plus in {

let OtherPredicates = [HasImageInsts] in
def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32",
VGPR_32, VReg_128> ;

} // let SubtargetPredicate = isGFX11Plus

Expand All @@ -732,6 +736,13 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;

let SubtargetPredicate = isGFX12Plus in {

let OtherPredicates = [HasImageInsts] in {
def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK<
"ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>;
def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK<
"ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>;
} // End OtherPredicates = [HasImageInsts].

defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>;
defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
Expand Down Expand Up @@ -1268,6 +1279,11 @@ defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>;
defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>;
defm DS_BPERMUTE_FI_B32 : DS_Real_gfx12<0x0cd>;

defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
"ds_bvh_stack_push4_pop1_rtn_b32", true>;
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;

// New aliases added in GFX12 without renaming the instructions.
let AssemblerPredicate = isGFX12Plus in {
def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">;
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1457,7 +1457,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
Info.opc = ISD::INTRINSIC_W_CHAIN;

const GCNTargetMachine &TM =
Expand Down
93 changes: 93 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s

; Calls llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn with a <4 x i32> %data1 and an
; offset immediate of 0, then passes both members of the returned { i32, i32 }
; pair to llvm.amdgcn.exp.i32. The expected output is a single
; ds_bvh_stack_push4_pop1_rtn_b32 with no offset modifier printed.
define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data0, <4 x i32> %data1) {
; CHECK-LABEL: test_ds_bvh_stack_push4_pop1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5]
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: export prim v1, off, off, off done
; CHECK-NEXT: s_endpgm
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
ret void
}

; Calls llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn with a <4 x i32> %data1 and a
; non-zero offset immediate (1), then passes both members of the returned
; { i32, i32 } pair to llvm.amdgcn.exp.i32. The expected output is a single
; ds_bvh_stack_push4_pop1_rtn_b32 printed with "offset:1".
define amdgpu_gs void @test_ds_bvh_stack_push4_pop1_1(i32 %addr, i32 %data0, <4 x i32> %data1) {
; CHECK-LABEL: test_ds_bvh_stack_push4_pop1_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: export prim v1, off, off, off done
; CHECK-NEXT: s_endpgm
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
ret void
}

; Calls llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn with a <8 x i32> %data1 and an
; offset immediate of 0, then passes both members of the returned { i32, i32 }
; pair to llvm.amdgcn.exp.i32. The expected output is a single
; ds_bvh_stack_push8_pop1_rtn_b32 taking an 8-register data1 operand (v[2:9])
; with no offset modifier printed.
define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data0, <8 x i32> %data1) {
; CHECK-LABEL: test_ds_bvh_stack_push8_pop1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9]
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: export prim v1, off, off, off done
; CHECK-NEXT: s_endpgm
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
ret void
}

; Calls llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn with a <8 x i32> %data1 and a
; non-zero offset immediate (1), then passes both members of the returned
; { i32, i32 } pair to llvm.amdgcn.exp.i32. The expected output is a single
; ds_bvh_stack_push8_pop1_rtn_b32 taking an 8-register data1 operand (v[2:9])
; and printed with "offset:1".
define amdgpu_gs void @test_ds_bvh_stack_push8_pop1_1(i32 %addr, i32 %data0, <8 x i32> %data1) {
; CHECK-LABEL: test_ds_bvh_stack_push8_pop1_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: export prim v1, off, off, off done
; CHECK-NEXT: s_endpgm
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
ret void
}

; Calls llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn, whose first result is an i64,
; with a <8 x i32> %data1 and an offset immediate of 0. The i64 is bitcast to
; <2 x i32> and split into low/high halves, which are passed together with the
; returned address to llvm.amdgcn.exp.i32. The expected output is a single
; ds_bvh_stack_push8_pop2_rtn_b64 writing a 64-bit register pair (v[1:2]).
; NOTE(review): %out1 and %out2 are never referenced in the body, and the
; expected export line lists only v1 (other sources print as "off"), so the
; high half and the returned address are not observed - confirm this is
; intentional.
define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
; CHECK-LABEL: test_ds_bvh_stack_push8_pop2:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9]
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: export prim v1, off, off, off done
; CHECK-NEXT: s_endpgm
%pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
%vdst = extractvalue { i64, i32 } %pair, 0
%newaddr = extractvalue { i64, i32 } %pair, 1
%vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
%vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
%vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false)
ret void
}

; Calls llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn, whose first result is an i64,
; with a <8 x i32> %data1 and a non-zero offset immediate (1). The i64 is
; bitcast to <2 x i32> and split into low/high halves, which are passed
; together with the returned address to llvm.amdgcn.exp.i32. The expected
; output is a single ds_bvh_stack_push8_pop2_rtn_b64 writing a 64-bit register
; pair (v[1:2]) and printed with "offset:1".
; NOTE(review): %out1 and %out2 are never referenced in the body, and the
; expected export line lists only v1 (other sources print as "off"), so the
; high half and the returned address are not observed - confirm this is
; intentional.
define amdgpu_gs void @test_ds_bvh_stack_push8_pop2_1(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
; CHECK-LABEL: test_ds_bvh_stack_push8_pop2_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9] offset:1
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: export prim v1, off, off, off done
; CHECK-NEXT: s_endpgm
%pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
%vdst = extractvalue { i64, i32 } %pair, 0
%newaddr = extractvalue { i64, i32 } %pair, 1
%vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
%vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
%vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false)
ret void
}
18 changes: 18 additions & 0 deletions llvm/test/MC/AMDGPU/gfx12_asm_ds.s
Original file line number Diff line number Diff line change
Expand Up @@ -1922,3 +1922,21 @@ ds_bpermute_fi_b32 v5, v1, v2 offset:0

ds_bpermute_fi_b32 v255, v255, v255 offset:4
// GFX12: encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff]

ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5]
// GFX12: encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]

ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1
// GFX12: encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]

ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9]
// GFX12: encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]

ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1
// GFX12: encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]

ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251]
// GFX12: encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]

ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127
// GFX12: encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
3 changes: 3 additions & 0 deletions llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ ds_subrev_u64 v1, v[2:3]

ds_subrev_rtn_u64 v[5:6], v1, v[2:3]
// GFX12: ds_rsub_rtn_u64 v[5:6], v1, v[2:3] ; encoding: [0x00,0x00,0x88,0xd9,0x01,0x02,0x00,0x05]

ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5]
// GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
24 changes: 24 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3242,3 +3242,27 @@

# GFX12: ds_bpermute_fi_b32 v255, v255, v255 offset:4 ; encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff]
0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff

# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01

# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 ; encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01

# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] ; encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01

# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 ; encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01

# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] ; encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe

# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe

# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01]
0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01

# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] ; encoding: [0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01]
0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01