-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][Clang] Add builtins for gfx12 ray tracing intrinsics #135224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][Clang] Add builtins for gfx12 ray tracing intrinsics #135224
Conversation
Builtins added: `__builtin_amdgcn_image_bvh8_intersect_ray` and `__builtin_amdgcn_image_bvh_dual_intersect_ray`. For the above two builtins, the second and third return values of the intrinsics are returned through pointer-type function arguments. Also added: `__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn`, `__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn`, and `__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn`. For the last builtin, the intrinsic returns `{i64, i32}`, while the builtin returns `<2 x i64>`; the second return value of the intrinsic is zext'ed.
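@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) Changes: `__builtin_amdgcn_image_bvh8_intersect_ray`, `__builtin_amdgcn_image_bvh_dual_intersect_ray`. For the above two builtins, the second and third return values of the intrinsics are returned through pointer-type function arguments. `__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn`, `__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn`, `__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn`. For the last builtin, the intrinsic returns `{i64, i32}`, the builtin returns `<2 x i64>`. The second return value of the intrinsic is zext'ed. Full diff: https://github.com/llvm/llvm-project/pull/135224.diff 3 Files Affected: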
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index cbef637be213a..39fef9e4601f8 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -506,6 +506,18 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "g
TARGET_BUILTIN(__builtin_amdgcn_ds_bpermute_fi_b32, "iii", "nc", "gfx12-insts")
+// For the following two builtins, the second and third return values of the
+// intrinsics are returned through the last two pointer-type function arguments.
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh8_intersect_ray, "V10UiWUifUcV3fV3fUiV4UiV3f*V3f*", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_dual_intersect_ray, "V10UiWUifUcV3fV3fV2UiV4UiV3f*V3f*", "nc", "gfx12-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn, "V2UiUiUiV4UiIi", "n", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn, "V2UiUiUiV8UiIi", "n", "gfx12-insts")
+
+// The intrinsic returns {i64, i32}, the builtin returns <2 x i64>.
+// The second return value of the intrinsic is zext'ed.
+TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi", "n", "gfx12-insts")
+
//===----------------------------------------------------------------------===//
// WMMA builtins.
// Postfix w32 indicates the builtin requires wavefront size of 32.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index b56b739094ff3..35c9f8ae48c80 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -616,19 +616,81 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
RayInverseDir, TextureDescr});
}
+ case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
+ case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
+ IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
+ IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
+ break;
+ }
+ llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
+ llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
+ llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
+ llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
+ llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
+ llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
+ llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));
+
+ Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
+ Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));
+
+ llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
+
+ llvm::CallInst *CI = Builder.CreateCall(
+ IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
+ Offset, TextureDescr});
+
+ llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
+ llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
+ llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);
+
+ Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
+ Builder.CreateStore(RetRayDir, RetRayDirPtr);
+
+ return RetVData;
+ }
+
+ case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
+ case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
+ case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
+ case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
+ IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
+ IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
+ IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
+ IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
+ break;
+ }
- case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: {
SmallVector<Value *, 4> Args;
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
Args.push_back(EmitScalarExpr(E->getArg(i)));
- Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn);
+ Function *F = CGM.getIntrinsic(IID);
Value *Call = Builder.CreateCall(F, Args);
Value *Rtn = Builder.CreateExtractValue(Call, 0);
Value *A = Builder.CreateExtractValue(Call, 1);
llvm::Type *RetTy = ConvertType(E->getType());
Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
(uint64_t)0);
+ // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
+ // <2 x i64>, zext the second value.
+ if (A->getType()->getPrimitiveSizeInBits() <
+ RetTy->getScalarType()->getPrimitiveSizeInBits())
+ A = Builder.CreateZExt(A, RetTy->getScalarType());
+
return Builder.CreateInsertElement(I0, A, 1);
}
case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
index 7f73cdd61c80d..2cf7f3dc6f80e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
@@ -3,6 +3,10 @@
// RUN: -emit-llvm -cl-std=CL2.0 -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \
// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=ISA %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm \
+// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12 %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S \
+// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12ISA %s
// Test llvm.amdgcn.image.bvh.intersect.ray intrinsic.
@@ -12,12 +16,18 @@
// Postfix l indicates the 1st argument is i64 and postfix h indicates
// the 4/5-th arguments are half4.
+typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned long ulong;
+typedef float float3 __attribute__((ext_vector_type(3)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef double double4 __attribute__((ext_vector_type(4)));
typedef half half4 __attribute__((ext_vector_type(4)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef uint uint8 __attribute__((ext_vector_type(8)));
+typedef uint uint10 __attribute__((ext_vector_type(10)));
+typedef ulong ulong2 __attribute__((ext_vector_type(2)));
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
// ISA: image_bvh_intersect_ray
@@ -59,3 +69,71 @@ void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
ray_origin, ray_dir, ray_inv_dir, texture_descr);
}
+#if __has_builtin(__builtin_amdgcn_image_bvh8_intersect_ray)
+// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh8.intersect.ray(
+// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
+// GFX12: <3 x float> %ray_dir, i32 %offset, <4 x i32> %texture_descr)
+// GFX12ISA: image_bvh8_intersect_ray
+void test_image_bvh8_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
+ float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
+ float3 ray_origin, float3 ray_dir, uint offset, uint4 texture_descr)
+{
+ *ret_vdata = __builtin_amdgcn_image_bvh8_intersect_ray(node_ptr, ray_extent,
+ instance_mask, ray_origin, ray_dir, offset, texture_descr,
+ ret_ray_origin, ret_ray_dir);
+}
+#endif
+
+#if __has_builtin(__builtin_amdgcn_image_bvh_dual_intersect_ray)
+// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh.dual.intersect.ray(
+// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
+// GFX12: <3 x float> %ray_dir, <2 x i32> %offset, <4 x i32> %texture_descr)
+// GFX12ISA: image_bvh_dual_intersect_ray
+void test_builtin_amdgcn_image_bvh_dual_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
+ float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
+ float3 ray_origin, float3 ray_dir, uint2 offset, uint4 texture_descr)
+{
+ *ret_vdata = __builtin_amdgcn_image_bvh_dual_intersect_ray(node_ptr, ray_extent,
+ instance_mask, ray_origin, ray_dir, offset, texture_descr,
+ ret_ray_origin, ret_ray_dir);
+}
+#endif
+
+#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn)
+// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(
+// GFX12: i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
+// GFX12ISA: ds_bvh_stack_push4_pop1_rtn
+void test_builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(uint* ret_vdst, uint* ret_addr,
+ uint addr, uint data0, uint4 data1)
+{
+ uint2 ret = __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
+ *ret_vdst = ret.x;
+ *ret_addr = ret.y;
+}
+#endif
+
+#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn)
+// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(
+// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
+// GFX12ISA: ds_bvh_stack_push8_pop1_rtn
+void test_builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(uint* ret_vdst, uint* ret_addr,
+ uint addr, uint data0, uint8 data1)
+{
+ uint2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
+ *ret_vdst = ret.x;
+ *ret_addr = ret.y;
+}
+#endif
+
+#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn)
+// GFX12: call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(
+// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
+// GFX12ISA: ds_bvh_stack_push8_pop2_rtn
+void test_builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(ulong* ret_vdst, uint* ret_addr,
+ uint addr, uint data0, uint8 data1)
+{
+ ulong2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(addr, data0, data1, /*constant offset=*/0);
+ *ret_vdst = ret.x;
+ *ret_addr = ret.y;
+}
+#endif
|
FWIW, this is part of the gfx12 upstream. |
For the above two builtins, the second and third return values of the intrinsics
are returned through pointer-type function arguments.
For the last builtin, the intrinsic returns `{i64, i32}`, the builtin returns `<2 x i64>`.
The second return value of the intrinsic is zext'ed.