Skip to content

Commit 27a62f6

Browse files
committed
[AMDGPU] global-isel support for RT
Differential Revision: https://reviews.llvm.org/D87847
1 parent a9fca98 commit 27a62f6

File tree

7 files changed

+331
-0
lines changed

7 files changed

+331
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3019,6 +3019,13 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
30193019
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
30203020
}
30213021

3022+
/// Select G_AMDGPU_INTRIN_BVH_INTERSECT_RAY by rewriting \p MI in place.
///
/// Operand 1 of \p MI carries, as an immediate, the opcode the instruction
/// should become. The descriptor for that opcode is installed on \p MI, the
/// now-redundant immediate is dropped, and the implicit operands of the new
/// descriptor are appended.
///
/// \returns true unconditionally.
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
  // Read the opcode before operand 1 is removed below.
  const unsigned TargetOpcode = MI.getOperand(1).getImm();
  MI.setDesc(TII.get(TargetOpcode));

  // The opcode immediate is not an operand of the selected instruction.
  MI.RemoveOperand(1);

  MachineFunction &MF = *MI.getParent()->getParent();
  MI.addImplicitDefUseOperands(MF);
  return true;
}
3028+
30223029
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
30233030
if (I.isPHI())
30243031
return selectPHI(I);
@@ -3138,6 +3145,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
31383145
assert(Intr && "not an image intrinsic with image pseudo");
31393146
return selectImageIntrinsic(I, Intr);
31403147
}
3148+
case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3149+
return selectBVHIntrinsic(I);
31413150
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
31423151
return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
31433152
default:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
143143
bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
144144
bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
145145
bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;
146+
bool selectBVHIntrinsic(MachineInstr &I) const;
146147

147148
std::pair<Register, unsigned>
148149
selectVOP3ModsImpl(MachineOperand &Root) const;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4488,6 +4488,78 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
44884488
return true;
44894489
}
44904490

4491+
/// Legalize the amdgcn_image_bvh_intersect_ray intrinsic by replacing \p MI
/// with a G_AMDGPU_INTRIN_BVH_INTERSECT_RAY instruction.
///
/// The replacement is laid out as:
///   def, imm target opcode, 32-bit address pieces..., texture descriptor,
///   imm a16 flag
/// with the memory operands of \p MI cloned onto it. The target opcode is
/// picked from the node pointer width (64 bits or not) and from whether the
/// ray direction elements are 16 bits wide.
///
/// \param MI the intrinsic instruction; erased on return.
/// \param B  builder used to emit the replacement instructions.
/// \returns true unconditionally.
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Operand 1 is not read here; the inputs start at operand 2.
  const Register Dst = MI.getOperand(0).getReg();
  const Register NodePtr = MI.getOperand(2).getReg();
  const Register RayExtent = MI.getOperand(3).getReg();
  const Register RayOrigin = MI.getOperand(4).getReg();
  const Register RayDir = MI.getOperand(5).getReg();
  const Register RayInvDir = MI.getOperand(6).getReg();
  const Register TDescr = MI.getOperand(7).getReg();

  const bool UseA16 =
      MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64BitPtr = MRI.getType(NodePtr).getSizeInBits() == 64;

  unsigned TargetOpc;
  if (UseA16) {
    TargetOpc = Is64BitPtr ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
                           : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa;
  } else {
    TargetOpc = Is64BitPtr ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
                           : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
  }

  // Address registers of the new instruction, in operand order.
  SmallVector<Register, 12> AddrRegs;

  // A 64-bit node pointer is passed as two 32-bit halves.
  if (!Is64BitPtr) {
    AddrRegs.push_back(NodePtr);
  } else {
    auto PtrHalves = B.buildUnmerge({S32, S32}, NodePtr);
    AddrRegs.push_back(PtrHalves.getReg(0));
    AddrRegs.push_back(PtrHalves.getReg(1));
  }
  AddrRegs.push_back(RayExtent);

  // Split a value into four 32-bit lanes and append the first three of them;
  // the fourth lane is dropped.
  auto appendFirstThreeLanes = [&](Register Vec) {
    auto Lanes = B.buildUnmerge({S32, S32, S32, S32}, Vec);
    for (unsigned Lane = 0; Lane != 3; ++Lane)
      AddrRegs.push_back(Lanes.getReg(Lane));
  };

  appendFirstThreeLanes(RayOrigin);

  if (!UseA16) {
    appendFirstThreeLanes(RayDir);
    appendFirstThreeLanes(RayInvDir);
  } else {
    // Pack the six used 16-bit elements (three from each vector) into three
    // 32-bit registers: {dir0, dir1}, {dir2, inv0}, {inv1, inv2}.
    auto DirElts = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
    auto InvDirElts = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);

    Register Packed[3];
    for (Register &Slot : Packed)
      Slot = MRI.createGenericVirtualRegister(S32);

    B.buildMerge(Packed[0], {DirElts.getReg(0), DirElts.getReg(1)});
    B.buildMerge(Packed[1], {DirElts.getReg(2), InvDirElts.getReg(0)});
    B.buildMerge(Packed[2], {InvDirElts.getReg(1), InvDirElts.getReg(2)});

    for (Register Slot : Packed)
      AddrRegs.push_back(Slot);
  }

  // The target opcode travels as an immediate directly after the def.
  auto NewMI = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY);
  NewMI.addDef(Dst);
  NewMI.addImm(TargetOpc);

  for (Register Reg : AddrRegs)
    NewMI.addUse(Reg);

  NewMI.addUse(TDescr);
  NewMI.addImm(UseA16 ? 1 : 0);
  NewMI.cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
4562+
44914563
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
44924564
MachineInstr &MI) const {
44934565
MachineIRBuilder &B = Helper.MIRBuilder;
@@ -4695,6 +4767,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
46954767
case Intrinsic::amdgcn_ds_fmin:
46964768
case Intrinsic::amdgcn_ds_fmax:
46974769
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4770+
case Intrinsic::amdgcn_image_bvh_intersect_ray:
4771+
return legalizeBVHIntrinsic(MI, B);
46984772
default: {
46994773
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
47004774
AMDGPU::getImageDimIntrinsicInfo(IntrID))

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
163163
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
164164
Intrinsic::ID IID) const;
165165

166+
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
167+
166168
bool legalizeImageIntrinsic(
167169
MachineInstr &MI, MachineIRBuilder &B,
168170
GISelChangeObserver &Observer,

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3053,6 +3053,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
30533053
applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
30543054
return;
30553055
}
3056+
case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3057+
unsigned N = MI.getNumExplicitOperands() - 2;
3058+
executeInWaterfallLoop(MI, MRI, { N });
3059+
return;
3060+
}
30563061
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
30573062
auto IntrID = MI.getIntrinsicID();
30583063
switch (IntrID) {
@@ -4242,6 +4247,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
42424247
assert(RSrcIntrin->IsImage);
42434248
return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
42444249
}
4250+
case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4251+
unsigned N = MI.getNumExplicitOperands() - 2;
4252+
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4253+
OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4254+
for (unsigned I = 2; I < N; ++I)
4255+
OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4256+
break;
4257+
}
42454258
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
42464259
auto IntrID = MI.getIntrinsicID();
42474260
switch (IntrID) {

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2487,3 +2487,11 @@ def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
24872487
let hasSideEffects = 0;
24882488
let mayStore = 1;
24892489
}
2490+
2491+
// Generic instruction for the BVH ray-intersection intrinsic. It has a single
// typed result, a leading `$intrin` input and a variable number of further
// inputs. It is marked as a load with no stores and no other side effects.
def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
  // Memory behaviour.
  let mayLoad = 1;
  let mayStore = 0;
  let hasSideEffects = 0;

  // Operands.
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let OutOperandList = (outs type0:$dst);
}
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3+
4+
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
5+
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
6+
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
7+
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
8+
9+
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
10+
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
11+
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
12+
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
13+
14+
; 32-bit node pointer, f32 ray direction vectors, texture descriptor passed inreg.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
15+
; GCN-LABEL: image_bvh_intersect_ray:
16+
; GCN: ; %bb.0:
17+
; GCN-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
18+
; GCN-NEXT: ; implicit-def: $vcc_hi
19+
; GCN-NEXT: s_waitcnt vmcnt(0)
20+
; GCN-NEXT: ; return to shader part epilog
21+
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
22+
%r = bitcast <4 x i32> %v to <4 x float>
23+
ret <4 x float> %r
24+
}
25+
26+
; 32-bit node pointer, f16 ray direction vectors (a16 form), texture descriptor passed inreg.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
27+
; GCN-LABEL: image_bvh_intersect_ray_a16:
28+
; GCN: ; %bb.0:
29+
; GCN-NEXT: s_mov_b32 s4, 0xffff
30+
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6
31+
; GCN-NEXT: v_and_b32_e32 v10, s4, v8
32+
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
33+
; GCN-NEXT: v_and_b32_e32 v9, s4, v9
34+
; GCN-NEXT: ; implicit-def: $vcc_hi
35+
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
36+
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
37+
; GCN-NEXT: v_and_or_b32 v5, v6, s4, v5
38+
; GCN-NEXT: v_and_or_b32 v6, v7, s4, v10
39+
; GCN-NEXT: v_lshl_or_b32 v7, v9, 16, v8
40+
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
41+
; GCN-NEXT: s_waitcnt vmcnt(0)
42+
; GCN-NEXT: ; return to shader part epilog
43+
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
44+
%r = bitcast <4 x i32> %v to <4 x float>
45+
ret <4 x float> %r
46+
}
47+
48+
; 64-bit node pointer, f32 ray direction vectors, texture descriptor passed inreg.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
49+
; GCN-LABEL: image_bvh64_intersect_ray:
50+
; GCN: ; %bb.0:
51+
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
52+
; GCN-NEXT: ; implicit-def: $vcc_hi
53+
; GCN-NEXT: s_waitcnt vmcnt(0)
54+
; GCN-NEXT: ; return to shader part epilog
55+
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
56+
%r = bitcast <4 x i32> %v to <4 x float>
57+
ret <4 x float> %r
58+
}
59+
60+
; 64-bit node pointer, f16 ray direction vectors (a16 form), texture descriptor passed inreg.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
61+
; GCN-LABEL: image_bvh64_intersect_ray_a16:
62+
; GCN: ; %bb.0:
63+
; GCN-NEXT: s_mov_b32 s4, 0xffff
64+
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7
65+
; GCN-NEXT: v_and_b32_e32 v11, s4, v9
66+
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
67+
; GCN-NEXT: v_and_b32_e32 v10, s4, v10
68+
; GCN-NEXT: ; implicit-def: $vcc_hi
69+
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
70+
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
71+
; GCN-NEXT: v_and_or_b32 v6, v7, s4, v6
72+
; GCN-NEXT: v_and_or_b32 v7, v8, s4, v11
73+
; GCN-NEXT: v_lshl_or_b32 v8, v10, 16, v9
74+
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
75+
; GCN-NEXT: s_waitcnt vmcnt(0)
76+
; GCN-NEXT: ; return to shader part epilog
77+
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
78+
%r = bitcast <4 x i32> %v to <4 x float>
79+
ret <4 x float> %r
80+
}
81+
82+
; 32-bit node pointer, f32 ray direction vectors; %tdescr is not inreg, and the
; expected output reads it with v_readfirstlane_b32 inside a loop.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
83+
; GCN-LABEL: image_bvh_intersect_ray_vgpr_descr:
84+
; GCN: ; %bb.0:
85+
; GCN-NEXT: s_mov_b32 s1, exec_lo
86+
; GCN-NEXT: ; implicit-def: $vcc_hi
87+
; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1
88+
; GCN-NEXT: v_readfirstlane_b32 s4, v14
89+
; GCN-NEXT: v_readfirstlane_b32 s5, v15
90+
; GCN-NEXT: v_readfirstlane_b32 s6, v16
91+
; GCN-NEXT: v_readfirstlane_b32 s7, v17
92+
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
93+
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
94+
; GCN-NEXT: s_nop 2
95+
; GCN-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
96+
; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
97+
; GCN-NEXT: s_and_saveexec_b32 s0, s0
98+
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
99+
; GCN-NEXT: s_cbranch_execnz BB4_1
100+
; GCN-NEXT: ; %bb.2:
101+
; GCN-NEXT: s_mov_b32 exec_lo, s1
102+
; GCN-NEXT: s_waitcnt vmcnt(0)
103+
; GCN-NEXT: v_mov_b32_e32 v0, v18
104+
; GCN-NEXT: v_mov_b32_e32 v1, v19
105+
; GCN-NEXT: v_mov_b32_e32 v2, v20
106+
; GCN-NEXT: v_mov_b32_e32 v3, v21
107+
; GCN-NEXT: ; return to shader part epilog
108+
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
109+
%r = bitcast <4 x i32> %v to <4 x float>
110+
ret <4 x float> %r
111+
}
112+
113+
; 32-bit node pointer, f16 ray direction vectors (a16 form); %tdescr is not
; inreg, and the expected output reads it with v_readfirstlane_b32 inside a loop.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
114+
; GCN-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
115+
; GCN: ; %bb.0:
116+
; GCN-NEXT: s_mov_b32 s0, 0xffff
117+
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6
118+
; GCN-NEXT: v_and_b32_e32 v14, s0, v8
119+
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
120+
; GCN-NEXT: v_and_b32_e32 v15, s0, v9
121+
; GCN-NEXT: s_mov_b32 s1, exec_lo
122+
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
123+
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
124+
; GCN-NEXT: ; implicit-def: $vcc_hi
125+
; GCN-NEXT: v_lshl_or_b32 v15, v15, 16, v8
126+
; GCN-NEXT: v_and_or_b32 v9, v6, s0, v5
127+
; GCN-NEXT: v_and_or_b32 v14, v7, s0, v14
128+
; GCN-NEXT: BB5_1: ; =>This Inner Loop Header: Depth=1
129+
; GCN-NEXT: v_readfirstlane_b32 s4, v10
130+
; GCN-NEXT: v_readfirstlane_b32 s5, v11
131+
; GCN-NEXT: v_readfirstlane_b32 s6, v12
132+
; GCN-NEXT: v_readfirstlane_b32 s7, v13
133+
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
134+
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
135+
; GCN-NEXT: s_nop 2
136+
; GCN-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16
137+
; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
138+
; GCN-NEXT: s_and_saveexec_b32 s0, s0
139+
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
140+
; GCN-NEXT: s_cbranch_execnz BB5_1
141+
; GCN-NEXT: ; %bb.2:
142+
; GCN-NEXT: s_mov_b32 exec_lo, s1
143+
; GCN-NEXT: s_waitcnt vmcnt(0)
144+
; GCN-NEXT: v_mov_b32_e32 v0, v5
145+
; GCN-NEXT: v_mov_b32_e32 v1, v6
146+
; GCN-NEXT: v_mov_b32_e32 v2, v7
147+
; GCN-NEXT: v_mov_b32_e32 v3, v8
148+
; GCN-NEXT: ; return to shader part epilog
149+
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
150+
%r = bitcast <4 x i32> %v to <4 x float>
151+
ret <4 x float> %r
152+
}
153+
154+
; 64-bit node pointer, f32 ray direction vectors; %tdescr is not inreg, and the
; expected output reads it with v_readfirstlane_b32 inside a loop.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
155+
; GCN-LABEL: image_bvh64_intersect_ray_vgpr_descr:
156+
; GCN: ; %bb.0:
157+
; GCN-NEXT: s_mov_b32 s1, exec_lo
158+
; GCN-NEXT: ; implicit-def: $vcc_hi
159+
; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
160+
; GCN-NEXT: v_readfirstlane_b32 s4, v15
161+
; GCN-NEXT: v_readfirstlane_b32 s5, v16
162+
; GCN-NEXT: v_readfirstlane_b32 s6, v17
163+
; GCN-NEXT: v_readfirstlane_b32 s7, v18
164+
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
165+
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
166+
; GCN-NEXT: s_nop 2
167+
; GCN-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
168+
; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
169+
; GCN-NEXT: s_and_saveexec_b32 s0, s0
170+
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
171+
; GCN-NEXT: s_cbranch_execnz BB6_1
172+
; GCN-NEXT: ; %bb.2:
173+
; GCN-NEXT: s_mov_b32 exec_lo, s1
174+
; GCN-NEXT: s_waitcnt vmcnt(0)
175+
; GCN-NEXT: v_mov_b32_e32 v0, v19
176+
; GCN-NEXT: v_mov_b32_e32 v1, v20
177+
; GCN-NEXT: v_mov_b32_e32 v2, v21
178+
; GCN-NEXT: v_mov_b32_e32 v3, v22
179+
; GCN-NEXT: ; return to shader part epilog
180+
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
181+
%r = bitcast <4 x i32> %v to <4 x float>
182+
ret <4 x float> %r
183+
}
184+
185+
; 64-bit node pointer, f16 ray direction vectors (a16 form); %tdescr is not
; inreg, and the expected output reads it with v_readfirstlane_b32 inside a loop.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
186+
; GCN-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
187+
; GCN: ; %bb.0:
188+
; GCN-NEXT: s_mov_b32 s0, 0xffff
189+
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7
190+
; GCN-NEXT: v_and_b32_e32 v15, s0, v9
191+
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
192+
; GCN-NEXT: v_and_b32_e32 v16, s0, v10
193+
; GCN-NEXT: s_mov_b32 s1, exec_lo
194+
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
195+
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
196+
; GCN-NEXT: ; implicit-def: $vcc_hi
197+
; GCN-NEXT: v_lshl_or_b32 v16, v16, 16, v9
198+
; GCN-NEXT: v_and_or_b32 v10, v7, s0, v6
199+
; GCN-NEXT: v_and_or_b32 v15, v8, s0, v15
200+
; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
201+
; GCN-NEXT: v_readfirstlane_b32 s4, v11
202+
; GCN-NEXT: v_readfirstlane_b32 s5, v12
203+
; GCN-NEXT: v_readfirstlane_b32 s6, v13
204+
; GCN-NEXT: v_readfirstlane_b32 s7, v14
205+
; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
206+
; GCN-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
207+
; GCN-NEXT: s_nop 2
208+
; GCN-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16
209+
; GCN-NEXT: s_and_b32 s0, s0, vcc_lo
210+
; GCN-NEXT: s_and_saveexec_b32 s0, s0
211+
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s0
212+
; GCN-NEXT: s_cbranch_execnz BB7_1
213+
; GCN-NEXT: ; %bb.2:
214+
; GCN-NEXT: s_mov_b32 exec_lo, s1
215+
; GCN-NEXT: s_waitcnt vmcnt(0)
216+
; GCN-NEXT: v_mov_b32_e32 v0, v6
217+
; GCN-NEXT: v_mov_b32_e32 v1, v7
218+
; GCN-NEXT: v_mov_b32_e32 v2, v8
219+
; GCN-NEXT: v_mov_b32_e32 v3, v9
220+
; GCN-NEXT: ; return to shader part epilog
221+
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
222+
%r = bitcast <4 x i32> %v to <4 x float>
223+
ret <4 x float> %r
224+
}

0 commit comments

Comments
 (0)