Skip to content

Commit 6b47bba

Browse files
[AMDGPU] Add intrinsics and MIs for ds_bvh_stack_* (#130007)
New intrinsics / instructions:
- int_amdgcn_ds_bvh_stack_push4_pop1_rtn / ds_bvh_stack_push4_pop1_rtn_b32
- int_amdgcn_ds_bvh_stack_push8_pop1_rtn / ds_bvh_stack_push8_pop1_rtn_b32
- int_amdgcn_ds_bvh_stack_push8_pop2_rtn / ds_bvh_stack_push8_pop2_rtn_b64

Co-authored-by: Mateja Marjanovic <[email protected]>
1 parent 3af6c9f commit 6b47bba

File tree

11 files changed: +218 additions, -14 deletions (lines changed)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2714,18 +2714,21 @@ def int_amdgcn_ds_sub_gs_reg_rtn :
27142714
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
27152715
"", [SDNPMemOperand]>;
27162716

2717-
def int_amdgcn_ds_bvh_stack_rtn :
2717+
class IntDSBVHStackRtn<LLVMType vdst, LLVMType data1> :
27182718
Intrinsic<
2719-
[llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
2719+
[vdst, llvm_i32_ty], // %vdst, %addr
27202720
[
27212721
llvm_i32_ty, // %addr
27222722
llvm_i32_ty, // %data0
2723-
llvm_v4i32_ty, // %data1
2723+
data1, // %data1
27242724
llvm_i32_ty, // %offset
27252725
],
27262726
[ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
27272727
>;
27282728

2729+
def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
2730+
data1 = llvm_v4i32_ty>;
2731+
27292732
def int_amdgcn_s_wait_event_export_ready :
27302733
ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
27312734
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
@@ -2801,6 +2804,15 @@ def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty
28012804
// GFX12 Intrinsics
28022805
//===----------------------------------------------------------------------===//
28032806

2807+
def int_amdgcn_ds_bvh_stack_push4_pop1_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
2808+
data1 = llvm_v4i32_ty>;
2809+
2810+
def int_amdgcn_ds_bvh_stack_push8_pop1_rtn : IntDSBVHStackRtn<vdst = llvm_i32_ty,
2811+
data1 = llvm_v8i32_ty>;
2812+
2813+
def int_amdgcn_ds_bvh_stack_push8_pop2_rtn : IntDSBVHStackRtn<vdst = llvm_i64_ty,
2814+
data1 = llvm_v8i32_ty>;
2815+
28042816
// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
28052817
def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
28062818
Intrinsic<[llvm_i32_ty],

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2664,8 +2664,20 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
26642664

26652665
// We need to handle this here because tablegen doesn't support matching
26662666
// instructions with multiple outputs.
2667-
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2668-
unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2667+
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2668+
unsigned Opc;
2669+
switch (IntrID) {
2670+
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2671+
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2672+
Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2673+
break;
2674+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2675+
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2676+
break;
2677+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2678+
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2679+
break;
2680+
}
26692681
SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
26702682
N->getOperand(5), N->getOperand(0)};
26712683

@@ -2829,7 +2841,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
28292841
return;
28302842
}
28312843
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2832-
SelectDSBvhStackIntrinsic(N);
2844+
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2845+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2846+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2847+
SelectDSBvhStackIntrinsic(N, IntrID);
28332848
return;
28342849
case Intrinsic::amdgcn_init_whole_wave:
28352850
CurDAG->getMachineFunction()

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
267267
void SelectFMAD_FMA(SDNode *N);
268268
void SelectFP_EXTEND(SDNode *N);
269269
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
270-
void SelectDSBvhStackIntrinsic(SDNode *N);
270+
void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
271271
void SelectDS_GWS(SDNode *N, unsigned IntrID);
272272
void SelectInterpP1F16(SDNode *N);
273273
void SelectINTRINSIC_W_CHAIN(SDNode *N);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2277,7 +2277,21 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
22772277
Register Data1 = MI.getOperand(5).getReg();
22782278
unsigned Offset = MI.getOperand(6).getImm();
22792279

2280-
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2280+
unsigned Opc;
2281+
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2282+
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2283+
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2284+
Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2285+
break;
2286+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2287+
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2288+
break;
2289+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2290+
Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2291+
break;
2292+
}
2293+
2294+
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
22812295
.addDef(Dst1)
22822296
.addUse(Addr)
22832297
.addUse(Data0)
@@ -2332,6 +2346,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
23322346
}
23332347
break;
23342348
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2349+
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2350+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2351+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
23352352
return selectDSBvhStackIntrinsic(I);
23362353
case Intrinsic::amdgcn_s_barrier_signal_var:
23372354
return selectNamedBarrierInit(I, IntrinsicID);

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5252,7 +5252,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
52525252
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
52535253
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
52545254
break;
5255-
case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
5255+
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
5256+
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
5257+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
5258+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
52565259
OpdsMapping[0] =
52575260
getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
52585261
OpdsMapping[1] =

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -258,10 +258,13 @@ multiclass DS_1A2D_Off8_RET_mc<string opName,
258258
}
259259
}
260260

261-
class DS_BVH_STACK<string opName>
261+
class DS_BVH_STACK<string opName,
262+
RegisterClass vdst_rc,
263+
RegisterClass data1_rc>
262264
: DS_Pseudo<opName,
263-
(outs getLdStRegisterOperand<VGPR_32>.ret:$vdst, VGPR_32:$addr),
264-
(ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0, VReg_128:$data1, Offset:$offset),
265+
(outs getLdStRegisterOperand<vdst_rc>.ret:$vdst, VGPR_32:$addr),
266+
(ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0,
267+
data1_rc:$data1, Offset:$offset),
265268
" $vdst, $addr, $data0, $data1$offset"> {
266269
let Constraints = "$addr = $addr_in";
267270
let DisableEncoding = "$addr_in";
@@ -722,7 +725,8 @@ def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
722725
let SubtargetPredicate = isGFX11Plus in {
723726

724727
let OtherPredicates = [HasImageInsts] in
725-
def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
728+
def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32",
729+
VGPR_32, VReg_128> ;
726730

727731
} // let SubtargetPredicate = isGFX11Plus
728732

@@ -732,6 +736,13 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
732736

733737
let SubtargetPredicate = isGFX12Plus in {
734738

739+
let OtherPredicates = [HasImageInsts] in {
740+
def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK<
741+
"ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>;
742+
def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK<
743+
"ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>;
744+
} // End OtherPredicates = [HasImageInsts].
745+
735746
defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
736747
defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>;
737748
defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
@@ -1268,6 +1279,11 @@ defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>;
12681279
defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>;
12691280
defm DS_BPERMUTE_FI_B32 : DS_Real_gfx12<0x0cd>;
12701281

1282+
defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
1283+
"ds_bvh_stack_push4_pop1_rtn_b32", true>;
1284+
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
1285+
defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
1286+
12711287
// New aliases added in GFX12 without renaming the instructions.
12721288
let AssemblerPredicate = isGFX12Plus in {
12731289
def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1457,7 +1457,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14571457
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
14581458
return true;
14591459
}
1460-
case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1460+
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1461+
case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1462+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1463+
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
14611464
Info.opc = ISD::INTRINSIC_W_CHAIN;
14621465

14631466
const GCNTargetMachine &TM =
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
3+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
4+
5+
define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data0, <4 x i32> %data1) {
6+
; CHECK-LABEL: test_ds_bvh_stack_push4_pop1:
7+
; CHECK: ; %bb.0:
8+
; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5]
9+
; CHECK-NEXT: s_wait_dscnt 0x0
10+
; CHECK-NEXT: export prim v1, off, off, off done
11+
; CHECK-NEXT: s_endpgm
12+
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
13+
%vdst = extractvalue { i32, i32 } %pair, 0
14+
%newaddr = extractvalue { i32, i32 } %pair, 1
15+
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
16+
ret void
17+
}
18+
19+
define amdgpu_gs void @test_ds_bvh_stack_push4_pop1_1(i32 %addr, i32 %data0, <4 x i32> %data1) {
20+
; CHECK-LABEL: test_ds_bvh_stack_push4_pop1_1:
21+
; CHECK: ; %bb.0:
22+
; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1
23+
; CHECK-NEXT: s_wait_dscnt 0x0
24+
; CHECK-NEXT: export prim v1, off, off, off done
25+
; CHECK-NEXT: s_endpgm
26+
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
27+
%vdst = extractvalue { i32, i32 } %pair, 0
28+
%newaddr = extractvalue { i32, i32 } %pair, 1
29+
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
30+
ret void
31+
}
32+
33+
define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data0, <8 x i32> %data1) {
34+
; CHECK-LABEL: test_ds_bvh_stack_push8_pop1:
35+
; CHECK: ; %bb.0:
36+
; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9]
37+
; CHECK-NEXT: s_wait_dscnt 0x0
38+
; CHECK-NEXT: export prim v1, off, off, off done
39+
; CHECK-NEXT: s_endpgm
40+
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
41+
%vdst = extractvalue { i32, i32 } %pair, 0
42+
%newaddr = extractvalue { i32, i32 } %pair, 1
43+
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
44+
ret void
45+
}
46+
47+
define amdgpu_gs void @test_ds_bvh_stack_push8_pop1_1(i32 %addr, i32 %data0, <8 x i32> %data1) {
48+
; CHECK-LABEL: test_ds_bvh_stack_push8_pop1_1:
49+
; CHECK: ; %bb.0:
50+
; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1
51+
; CHECK-NEXT: s_wait_dscnt 0x0
52+
; CHECK-NEXT: export prim v1, off, off, off done
53+
; CHECK-NEXT: s_endpgm
54+
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
55+
%vdst = extractvalue { i32, i32 } %pair, 0
56+
%newaddr = extractvalue { i32, i32 } %pair, 1
57+
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
58+
ret void
59+
}
60+
61+
define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
62+
; CHECK-LABEL: test_ds_bvh_stack_push8_pop2:
63+
; CHECK: ; %bb.0:
64+
; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9]
65+
; CHECK-NEXT: s_wait_dscnt 0x0
66+
; CHECK-NEXT: export prim v1, off, off, off done
67+
; CHECK-NEXT: s_endpgm
68+
%pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
69+
%vdst = extractvalue { i64, i32 } %pair, 0
70+
%newaddr = extractvalue { i64, i32 } %pair, 1
71+
%vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
72+
%vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
73+
%vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
74+
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false)
75+
ret void
76+
}
77+
78+
define amdgpu_gs void @test_ds_bvh_stack_push8_pop2_1(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
79+
; CHECK-LABEL: test_ds_bvh_stack_push8_pop2_1:
80+
; CHECK: ; %bb.0:
81+
; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9] offset:1
82+
; CHECK-NEXT: s_wait_dscnt 0x0
83+
; CHECK-NEXT: export prim v1, off, off, off done
84+
; CHECK-NEXT: s_endpgm
85+
%pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
86+
%vdst = extractvalue { i64, i32 } %pair, 0
87+
%newaddr = extractvalue { i64, i32 } %pair, 1
88+
%vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
89+
%vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
90+
%vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
91+
call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false)
92+
ret void
93+
}

llvm/test/MC/AMDGPU/gfx12_asm_ds.s

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1922,3 +1922,21 @@ ds_bpermute_fi_b32 v5, v1, v2 offset:0
19221922

19231923
ds_bpermute_fi_b32 v255, v255, v255 offset:4
19241924
// GFX12: encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff]
1925+
1926+
ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5]
1927+
// GFX12: encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
1928+
1929+
ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1
1930+
// GFX12: encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
1931+
1932+
ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9]
1933+
// GFX12: encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
1934+
1935+
ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1
1936+
// GFX12: encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
1937+
1938+
ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251]
1939+
// GFX12: encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
1940+
1941+
ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127
1942+
// GFX12: encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]

llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ ds_subrev_u64 v1, v[2:3]
3535

3636
ds_subrev_rtn_u64 v[5:6], v1, v[2:3]
3737
// GFX12: ds_rsub_rtn_u64 v[5:6], v1, v[2:3] ; encoding: [0x00,0x00,0x88,0xd9,0x01,0x02,0x00,0x05]
38+
39+
ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5]
40+
// GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]

llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3242,3 +3242,27 @@
32423242

32433243
# GFX12: ds_bpermute_fi_b32 v255, v255, v255 offset:4 ; encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff]
32443244
0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff
3245+
3246+
# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
3247+
0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01
3248+
3249+
# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 ; encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
3250+
0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01
3251+
3252+
# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] ; encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
3253+
0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01
3254+
3255+
# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 ; encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
3256+
0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01
3257+
3258+
# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] ; encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
3259+
0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe
3260+
3261+
# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
3262+
0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe
3263+
3264+
# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01]
3265+
0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01
3266+
3267+
# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] ; encoding: [0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01]
3268+
0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01

0 commit comments

Comments
 (0)