Skip to content

Commit 9e90e10

Browse files
authored
[AMDGPU][Clang] Add builtins for gfx12 ray tracing intrinsics (#135224)
1 parent b658a2e commit 9e90e10

File tree

3 files changed

+154
-2
lines changed

3 files changed

+154
-2
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,18 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "g
506506

507507
TARGET_BUILTIN(__builtin_amdgcn_ds_bpermute_fi_b32, "iii", "nc", "gfx12-insts")
508508

509+
// For the following two builtins, the second and third return values of the
510+
// intrinsics are returned through the last two pointer-type function arguments.
511+
TARGET_BUILTIN(__builtin_amdgcn_image_bvh8_intersect_ray, "V10UiWUifUcV3fV3fUiV4UiV3f*V3f*", "nc", "gfx12-insts")
512+
TARGET_BUILTIN(__builtin_amdgcn_image_bvh_dual_intersect_ray, "V10UiWUifUcV3fV3fV2UiV4UiV3f*V3f*", "nc", "gfx12-insts")
513+
514+
// The two builtins below return a 2 x uint vector holding both values
// produced by the corresponding intrinsic. Their last argument is an integer
// constant offset.
// NOTE(review): push4_pop1 is gated on gfx11-insts while the other builtins
// added here are gated on gfx12-insts -- confirm this is intentional.
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn, "V2UiUiUiV4UiIi", "n", "gfx11-insts")
515+
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn, "V2UiUiUiV8UiIi", "n", "gfx12-insts")
516+
517+
// The intrinsic returns {i64, i32}, the builtin returns <2 x i64>.
518+
// The second return value of the intrinsic is zext'ed.
519+
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi", "n", "gfx12-insts")
520+
509521
//===----------------------------------------------------------------------===//
510522
// WMMA builtins.
511523
// Postfix w32 indicates the builtin requires wavefront size of 32.

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -616,19 +616,81 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
616616
return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
617617
RayInverseDir, TextureDescr});
618618
}
619+
// Lowers the bvh8 / bvh_dual intersect-ray builtins. Arguments 0-6 are
// forwarded to the matching intrinsic. The intrinsic result is an aggregate:
// member 0 becomes the builtin's return value, while members 1 and 2 are
// stored through the pointers passed as arguments 7 and 8.
case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
620+
case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
621+
// NOTE(review): IID has no initializer and the switch below has no default.
// Both labels that reach this block assign it, but an explicit
// default: llvm_unreachable(...) would document that and avoid
// maybe-uninitialized warnings -- confirm against the file's conventions.
Intrinsic::ID IID;
622+
switch (BuiltinID) {
623+
case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
624+
IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
625+
break;
626+
case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
627+
IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
628+
break;
629+
}
630+
// Input operands, in builtin argument order.
llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
631+
llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
632+
llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
633+
llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
634+
llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
635+
llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
636+
llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));
637+
638+
// The last two builtin arguments are output pointers, so they are emitted as
// addresses rather than as scalar values.
Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
639+
Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));
640+
641+
llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
642+
643+
llvm::CallInst *CI = Builder.CreateCall(
644+
IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
645+
Offset, TextureDescr});
646+
647+
// Split the aggregate returned by the intrinsic into its three members.
llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
648+
llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
649+
llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);
650+
651+
// Write the second and third members back through the output pointers.
Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
652+
Builder.CreateStore(RetRayDir, RetRayDirPtr);
653+
654+
return RetVData;
655+
}
656+
657+
// Shared lowering for the ds_bvh_stack_* builtins: select the intrinsic that
// matches BuiltinID, forward every builtin argument unchanged, then pack the
// two members of the intrinsic's result into the builtin's vector return type.
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
658+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
659+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
660+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
661+
// NOTE(review): IID has no initializer and the switch below has no default.
// Every label that reaches this block assigns it, but an explicit
// default: llvm_unreachable(...) would document that -- confirm against the
// file's conventions.
Intrinsic::ID IID;
662+
switch (BuiltinID) {
663+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
664+
IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
665+
break;
666+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
667+
IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
668+
break;
669+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
670+
IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
671+
break;
672+
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
673+
IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
674+
break;
675+
}
619676

620-
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: {
621677
SmallVector<Value *, 4> Args;
622678
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
623679
Args.push_back(EmitScalarExpr(E->getArg(i)));
624680

625-
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn);
681+
Function *F = CGM.getIntrinsic(IID);
626682
Value *Call = Builder.CreateCall(F, Args);
627683
Value *Rtn = Builder.CreateExtractValue(Call, 0);
628684
Value *A = Builder.CreateExtractValue(Call, 1);
629685
llvm::Type *RetTy = ConvertType(E->getType());
630686
// Element 0 of the result vector is the first member of the intrinsic result.
Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
631687
(uint64_t)0);
688+
// ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
689+
// <2 x i64>, zext the second value.
690+
if (A->getType()->getPrimitiveSizeInBits() <
691+
RetTy->getScalarType()->getPrimitiveSizeInBits())
692+
A = Builder.CreateZExt(A, RetTy->getScalarType());
693+
632694
return Builder.CreateInsertElement(I0, A, 1);
633695
}
634696
case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:

clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
// RUN: -emit-llvm -cl-std=CL2.0 -o - %s | FileCheck %s
44
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \
55
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=ISA %s
6+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm \
7+
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12 %s
8+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S \
9+
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12ISA %s
610

711
// Test llvm.amdgcn.image.bvh.intersect.ray intrinsic.
812

@@ -12,12 +16,18 @@
1216
// Postfix l indicates the 1st argument is i64 and postfix h indicates
1317
// the 4/5-th arguments are half4.
1418

19+
typedef unsigned char uchar;
1520
typedef unsigned int uint;
1621
typedef unsigned long ulong;
22+
typedef float float3 __attribute__((ext_vector_type(3)));
1723
typedef float float4 __attribute__((ext_vector_type(4)));
1824
typedef double double4 __attribute__((ext_vector_type(4)));
1925
typedef half half4 __attribute__((ext_vector_type(4)));
26+
typedef uint uint2 __attribute__((ext_vector_type(2)));
2027
typedef uint uint4 __attribute__((ext_vector_type(4)));
28+
typedef uint uint8 __attribute__((ext_vector_type(8)));
29+
typedef uint uint10 __attribute__((ext_vector_type(10)));
30+
typedef ulong ulong2 __attribute__((ext_vector_type(2)));
2131

2232
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
2333
// ISA: image_bvh_intersect_ray
@@ -59,3 +69,71 @@ void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
5969
ray_origin, ray_dir, ray_inv_dir, texture_descr);
6070
}
6171

72+
// Exercises __builtin_amdgcn_image_bvh8_intersect_ray. The call result is
// stored to *ret_vdata; the two trailing pointer arguments receive the
// returned ray origin and ray direction. The __has_builtin guard compiles the
// test only when the selected target provides the builtin.
#if __has_builtin(__builtin_amdgcn_image_bvh8_intersect_ray)
73+
// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh8.intersect.ray(
74+
// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
75+
// GFX12: <3 x float> %ray_dir, i32 %offset, <4 x i32> %texture_descr)
76+
// GFX12ISA: image_bvh8_intersect_ray
77+
void test_image_bvh8_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
78+
float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
79+
float3 ray_origin, float3 ray_dir, uint offset, uint4 texture_descr)
80+
{
81+
*ret_vdata = __builtin_amdgcn_image_bvh8_intersect_ray(node_ptr, ray_extent,
82+
instance_mask, ray_origin, ray_dir, offset, texture_descr,
83+
ret_ray_origin, ret_ray_dir);
84+
}
85+
#endif
86+
87+
// Exercises __builtin_amdgcn_image_bvh_dual_intersect_ray. It has the same
// shape as the bvh8 test, except that the offset operand is a uint2 here. The
// call result is stored to *ret_vdata and the two trailing pointer arguments
// receive the returned ray origin and ray direction.
#if __has_builtin(__builtin_amdgcn_image_bvh_dual_intersect_ray)
88+
// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh.dual.intersect.ray(
89+
// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
90+
// GFX12: <3 x float> %ray_dir, <2 x i32> %offset, <4 x i32> %texture_descr)
91+
// GFX12ISA: image_bvh_dual_intersect_ray
92+
void test_builtin_amdgcn_image_bvh_dual_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
93+
float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
94+
float3 ray_origin, float3 ray_dir, uint2 offset, uint4 texture_descr)
95+
{
96+
*ret_vdata = __builtin_amdgcn_image_bvh_dual_intersect_ray(node_ptr, ray_extent,
97+
instance_mask, ray_origin, ray_dir, offset, texture_descr,
98+
ret_ray_origin, ret_ray_dir);
99+
}
100+
#endif
101+
102+
// Exercises __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn with a uint4 data1
// operand and a literal 0 offset. The uint2 result is split into its .x and
// .y components and written through the two output pointers.
// NOTE(review): the RUN lines visible in this test target gfx1030 and gfx1200
// only, so a gfx11 target is presumably not covered here -- confirm whether
// that is needed for this builtin.
#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn)
103+
// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(
104+
// GFX12: i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
105+
// GFX12ISA: ds_bvh_stack_push4_pop1_rtn
106+
void test_builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(uint* ret_vdst, uint* ret_addr,
107+
uint addr, uint data0, uint4 data1)
108+
{
109+
uint2 ret = __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
110+
*ret_vdst = ret.x;
111+
*ret_addr = ret.y;
112+
}
113+
#endif
114+
115+
// Exercises __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn with a uint8 data1
// operand and a literal 0 offset. The uint2 result is split into its .x and
// .y components and written through the two output pointers.
#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn)
116+
// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(
117+
// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
118+
// GFX12ISA: ds_bvh_stack_push8_pop1_rtn
119+
void test_builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(uint* ret_vdst, uint* ret_addr,
120+
uint addr, uint data0, uint8 data1)
121+
{
122+
uint2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
123+
*ret_vdst = ret.x;
124+
*ret_addr = ret.y;
125+
}
126+
#endif
127+
128+
// Exercises __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, whose result is a
// ulong2. ret.x is stored as a ulong; ret.y is a ulong that is implicitly
// narrowed when written through the uint pointer ret_addr.
#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn)
129+
// GFX12: call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(
130+
// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
131+
// GFX12ISA: ds_bvh_stack_push8_pop2_rtn
132+
void test_builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(ulong* ret_vdst, uint* ret_addr,
133+
uint addr, uint data0, uint8 data1)
134+
{
135+
ulong2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(addr, data0, data1, /*constant offset=*/0);
136+
*ret_vdst = ret.x;
137+
*ret_addr = ret.y;
138+
}
139+
#endif

0 commit comments

Comments
 (0)