Skip to content

[AMDGPU][Clang] Add builtins for gfx12 ray tracing intrinsics #135224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,18 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "g

TARGET_BUILTIN(__builtin_amdgcn_ds_bpermute_fi_b32, "iii", "nc", "gfx12-insts")

// For the following two builtins, the second and third return values of the
// intrinsics are returned through the last two pointer-type function arguments.
// Operand order: node pointer, ray extent, instance mask, ray origin, ray
// direction, offset, texture descriptor, then the two output pointers.
// The two type strings differ only in the offset operand (Ui vs. V2Ui).
TARGET_BUILTIN(__builtin_amdgcn_image_bvh8_intersect_ray, "V10UiWUifUcV3fV3fUiV4UiV3f*V3f*", "nc", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_image_bvh_dual_intersect_ray, "V10UiWUifUcV3fV3fV2UiV4UiV3f*V3f*", "nc", "gfx12-insts")

// BVH stack push/pop builtins. Operand order: addr, data0, data1, and a
// trailing offset that the tests pass as a constant. Note that push4_pop1 is
// gated on gfx11-insts while both push8 variants are gated on gfx12-insts.
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn, "V2UiUiUiV4UiIi", "n", "gfx11-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn, "V2UiUiUiV8UiIi", "n", "gfx12-insts")

// The intrinsic returns {i64, i32}, the builtin returns <2 x i64>.
// The second return value of the intrinsic is zext'ed.
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi", "n", "gfx12-insts")

//===----------------------------------------------------------------------===//
// WMMA builtins.
// Postfix w32 indicates the builtin requires wavefront size of 32.
Expand Down
66 changes: 64 additions & 2 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -616,19 +616,81 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
RayInverseDir, TextureDescr});
}
// Lower the BVH8 and BVH-dual ray intersection builtins. Both share the same
// lowering: call the matching intrinsic with builtin arguments 0-6, return
// element 0 of its aggregate result, and store elements 1 and 2 through the
// pointers supplied as builtin arguments 7 and 8.
case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
Intrinsic::ID IID;
// Select the intrinsic for the builtin. Only the two builtin IDs named in the
// enclosing case labels reach this switch, so IID is always assigned.
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
break;
case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
break;
}
// Builtin arguments 0-6 are the intrinsic operands, forwarded in order.
llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));

// Builtin arguments 7 and 8 are output pointers; they are not passed to the
// intrinsic but receive its second and third results below.
Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));

llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);

llvm::CallInst *CI = Builder.CreateCall(
IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
Offset, TextureDescr});

// Unpack the three-element aggregate returned by the intrinsic.
llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);

// Write the secondary results through the caller-provided pointers.
Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
Builder.CreateStore(RetRayDir, RetRayDirPtr);

// Element 0 becomes the value of the builtin call expression.
return RetVData;
}

// Lower the ds_bvh_stack* builtins: forward every builtin argument to the
// selected intrinsic, then repack its two-element aggregate result into the
// builtin's two-element vector return type.
// NOTE(review): this hunk appears to show pre-change lines (the standalone
// `case ...ds_bvh_stack_rtn: {` label and the getIntrinsic call hard-coded to
// amdgcn_ds_bvh_stack_rtn) alongside their replacements; confirm which lines
// are live before editing.
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
Intrinsic::ID IID;
// Map each builtin ID to the intrinsic of the same name.
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
break;
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
break;
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
break;
case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
break;
}

case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: {
// Emit all builtin arguments in order; they are passed to the intrinsic as-is.
SmallVector<Value *, 4> Args;
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
Args.push_back(EmitScalarExpr(E->getArg(i)));

Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn);
Function *F = CGM.getIntrinsic(IID);
Value *Call = Builder.CreateCall(F, Args);
// Split the aggregate result into its two members.
Value *Rtn = Builder.CreateExtractValue(Call, 0);
Value *A = Builder.CreateExtractValue(Call, 1);
// Start from a poison vector of the builtin's return type and fill lane 0.
llvm::Type *RetTy = ConvertType(E->getType());
Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
(uint64_t)0);
// ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
// <2 x i64>, zext the second value.
if (A->getType()->getPrimitiveSizeInBits() <
RetTy->getScalarType()->getPrimitiveSizeInBits())
A = Builder.CreateZExt(A, RetTy->getScalarType());

// Fill lane 1 with the (possibly widened) second member and return the vector.
return Builder.CreateInsertElement(I0, A, 1);
}
case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
Expand Down
78 changes: 78 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
// RUN: -emit-llvm -cl-std=CL2.0 -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=ISA %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm \
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12 %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S \
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12ISA %s

// Test llvm.amdgcn.image.bvh.intersect.ray intrinsic.

Expand All @@ -12,12 +16,18 @@
// Postfix l indicates the 1st argument is i64 and postfix h indicates
// the 4/5-th arguments are half4.

typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef float float3 __attribute__((ext_vector_type(3)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef double double4 __attribute__((ext_vector_type(4)));
typedef half half4 __attribute__((ext_vector_type(4)));
typedef uint uint2 __attribute__((ext_vector_type(2)));
typedef uint uint4 __attribute__((ext_vector_type(4)));
typedef uint uint8 __attribute__((ext_vector_type(8)));
typedef uint uint10 __attribute__((ext_vector_type(10)));
typedef ulong ulong2 __attribute__((ext_vector_type(2)));

// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
// ISA: image_bvh_intersect_ray
Expand Down Expand Up @@ -59,3 +69,71 @@ void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
ray_origin, ray_dir, ray_inv_dir, texture_descr);
}

#if __has_builtin(__builtin_amdgcn_image_bvh8_intersect_ray)
// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh8.intersect.ray(
// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
// GFX12: <3 x float> %ray_dir, i32 %offset, <4 x i32> %texture_descr)
// GFX12ISA: image_bvh8_intersect_ray
// Exercises __builtin_amdgcn_image_bvh8_intersect_ray with a scalar uint
// offset. The uint10 result is stored through ret_vdata; ret_ray_origin and
// ret_ray_dir are handed to the builtin as its last two (output) arguments.
// Parameter names are matched by the IR checks above, so keep them unchanged.
void test_image_bvh8_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
float3 ray_origin, float3 ray_dir, uint offset, uint4 texture_descr)
{
*ret_vdata = __builtin_amdgcn_image_bvh8_intersect_ray(node_ptr, ray_extent,
instance_mask, ray_origin, ray_dir, offset, texture_descr,
ret_ray_origin, ret_ray_dir);
}
#endif

#if __has_builtin(__builtin_amdgcn_image_bvh_dual_intersect_ray)
// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh.dual.intersect.ray(
// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
// GFX12: <3 x float> %ray_dir, <2 x i32> %offset, <4 x i32> %texture_descr)
// GFX12ISA: image_bvh_dual_intersect_ray
// Exercises __builtin_amdgcn_image_bvh_dual_intersect_ray; unlike the bvh8
// variant, the offset operand here is a uint2. The uint10 result is stored
// through ret_vdata; ret_ray_origin and ret_ray_dir are handed to the builtin
// as its last two (output) arguments.
// Parameter names are matched by the IR checks above, so keep them unchanged.
void test_builtin_amdgcn_image_bvh_dual_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
float3 ray_origin, float3 ray_dir, uint2 offset, uint4 texture_descr)
{
*ret_vdata = __builtin_amdgcn_image_bvh_dual_intersect_ray(node_ptr, ray_extent,
instance_mask, ray_origin, ray_dir, offset, texture_descr,
ret_ray_origin, ret_ray_dir);
}
#endif

#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn)
// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(
// GFX12: i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
// GFX12ISA: ds_bvh_stack_push4_pop1_rtn
// Exercises __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn with a uint4 data1
// operand and a literal 0 offset. The uint2 result is split: lane x goes to
// ret_vdst and lane y to ret_addr.
// Parameter names are matched by the IR checks above, so keep them unchanged.
void test_builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(uint* ret_vdst, uint* ret_addr,
uint addr, uint data0, uint4 data1)
{
uint2 ret = __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
*ret_vdst = ret.x;
*ret_addr = ret.y;
}
#endif

#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn)
// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(
// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
// GFX12ISA: ds_bvh_stack_push8_pop1_rtn
// Exercises __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn with a uint8 data1
// operand and a literal 0 offset. The uint2 result is split: lane x goes to
// ret_vdst and lane y to ret_addr.
// Parameter names are matched by the IR checks above, so keep them unchanged.
void test_builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(uint* ret_vdst, uint* ret_addr,
uint addr, uint data0, uint8 data1)
{
uint2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
*ret_vdst = ret.x;
*ret_addr = ret.y;
}
#endif

#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn)
// GFX12: call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(
// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
// GFX12ISA: ds_bvh_stack_push8_pop2_rtn
// Exercises __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn with a uint8 data1
// operand and a literal 0 offset. The builtin returns ulong2: lane x goes to
// the 64-bit ret_vdst, and lane y is narrowed by the assignment to the 32-bit
// ret_addr (the builtin's definition notes that lane y is a zero-extended
// 32-bit value, so the narrowing is presumably lossless).
// Parameter names are matched by the IR checks above, so keep them unchanged.
void test_builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(ulong* ret_vdst, uint* ret_addr,
uint addr, uint data0, uint8 data1)
{
ulong2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(addr, data0, data1, /*constant offset=*/0);
*ret_vdst = ret.x;
*ret_addr = ret.y;
}
#endif