
[Clang][AMDGPU] Add a builtin for llvm.amdgcn.make.buffer.rsrc intrinsic #95276

Merged · 1 commit · Jun 20, 2024
1 change: 1 addition & 0 deletions clang/include/clang/Basic/Builtins.def
@@ -33,6 +33,7 @@
// q -> Scalable vector, followed by the number of elements and the base type.
// Q -> target builtin type, followed by a character to distinguish the builtin type
// Qa -> AArch64 svcount_t builtin type.
// Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type.
// E -> ext_vector, followed by the number of elements and the base type.
// X -> _Complex, followed by the base type.
// Y -> ptrdiff_t
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -148,6 +148,8 @@ BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc")
BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc")

BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc")

//===----------------------------------------------------------------------===//
// Ballot builtins.
//===----------------------------------------------------------------------===//
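
For readers decoding the signature string: per the prototype encoding in Builtins.def, "Qbv*sii" means the builtin returns the target type __amdgpu_buffer_rsrc_t (Qb) and takes a void pointer, a short, and two ints; the "nc" attributes mark it nothrow and const (no side effects). A rough C-level prototype, with illustrative parameter names that are not part of the definition:

__amdgpu_buffer_rsrc_t __builtin_amdgcn_make_buffer_rsrc(void *base, short stride,
                                                          int num_records, int flags);
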
4 changes: 4 additions & 0 deletions clang/lib/AST/ASTContext.cpp
@@ -11546,6 +11546,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
      Type = Context.SveCountTy;
      break;
    }
    case 'b': {
      Type = Context.AMDGPUBufferRsrcTy;
      break;
    }
    default:
      llvm_unreachable("Unexpected target builtin type");
    }
13 changes: 13 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -615,6 +615,17 @@ static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
  return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
}

static Value *emitQuaternaryBuiltin(CodeGenFunction &CGF, const CallExpr *E,
                                    unsigned IntrinsicID) {
  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
  llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
  llvm::Value *Src3 = CGF.EmitScalarExpr(E->getArg(3));

  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
  return CGF.Builder.CreateCall(F, {Src0, Src1, Src2, Src3});
}

// Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
const CallExpr *E,
@@ -19111,6 +19122,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
        CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
    return Builder.CreateCall(F, {Arg});
  }
  case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc:
    return emitQuaternaryBuiltin(*this, E, Intrinsic::amdgcn_make_buffer_rsrc);
  default:
    return nullptr;
  }
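
Because getIntrinsic is passed Src0->getType(), the emitted llvm.amdgcn.make.buffer.rsrc call is overloaded on the address space of the base pointer: generic pointers select the .p0 variant and global pointers the .p1 variant, as the tests below check. A minimal HIP-side sketch of what this handler produces (hypothetical function name; __device__ as defined in the HIP test below):

// Sketch only; the IR comment mirrors the CHECK lines in the tests.
__device__ __amdgpu_buffer_rsrc_t make_rsrc_sketch(void *p, short stride, int num, int flags) {
  // Lowers to:
  //   call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %num, i32 %flags)
  return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
}
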
105 changes: 105 additions & 0 deletions clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
@@ -0,0 +1,105 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu verde -emit-llvm -disable-llvm-optzns -fcuda-is-device -o - %s | FileCheck %s

#define __device__ __attribute__((device))

// CHECK-LABEL: define dso_local ptr addrspace(8) @_Z31test_amdgcn_make_buffer_rsrc_p0Pvsii(
// CHECK-SAME: ptr noundef [[P:%.*]], i16 noundef signext [[STRIDE:%.*]], i32 noundef [[NUM:%.*]], i32 noundef [[FLAGS:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[STRIDE_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
// CHECK-NEXT: [[NUM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[FLAGS_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
// CHECK-NEXT: [[STRIDE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STRIDE_ADDR]] to ptr
// CHECK-NEXT: [[NUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NUM_ADDR]] to ptr
// CHECK-NEXT: [[FLAGS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS_ADDR]] to ptr
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i16 [[STRIDE]], ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: store i32 [[NUM]], ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP4]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
}

// CHECK-LABEL: define dso_local ptr addrspace(8) @_Z47test_amdgcn_make_buffer_rsrc_p0_stride_constantPvii(
// CHECK-SAME: ptr noundef [[P:%.*]], i32 noundef [[NUM:%.*]], i32 noundef [[FLAGS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[NUM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[FLAGS_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
// CHECK-NEXT: [[NUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NUM_ADDR]] to ptr
// CHECK-NEXT: [[FLAGS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS_ADDR]] to ptr
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[NUM]], ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, /*stride=*/4, num, flags);
}

// CHECK-LABEL: define dso_local ptr addrspace(8) @_Z44test_amdgcn_make_buffer_rsrc_p0_num_constantPvsi(
// CHECK-SAME: ptr noundef [[P:%.*]], i16 noundef signext [[STRIDE:%.*]], i32 noundef [[FLAGS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[STRIDE_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
// CHECK-NEXT: [[FLAGS_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
// CHECK-NEXT: [[STRIDE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STRIDE_ADDR]] to ptr
// CHECK-NEXT: [[FLAGS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS_ADDR]] to ptr
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i16 [[STRIDE]], ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: store i32 [[FLAGS]], ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, /*num=*/1234, flags);
}

// CHECK-LABEL: define dso_local ptr addrspace(8) @_Z46test_amdgcn_make_buffer_rsrc_p0_flags_constantPvsi(
// CHECK-SAME: ptr noundef [[P:%.*]], i16 noundef signext [[STRIDE:%.*]], i32 noundef [[NUM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[STRIDE_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
// CHECK-NEXT: [[NUM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
// CHECK-NEXT: [[STRIDE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STRIDE_ADDR]] to ptr
// CHECK-NEXT: [[NUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NUM_ADDR]] to ptr
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i16 [[STRIDE]], ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: store i32 [[NUM]], ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, /*flags=*/5678);
}
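
The OpenCL test that follows additionally covers the global (addrspace 1) pointer overload, llvm.amdgcn.make.buffer.rsrc.p1, and calls whose base pointer is a literal null. A rough HIP analogue of the null case, under the assumption that the constant folds the same way as in the OpenCL nullptr tests below:

// Assumption: a literal null base pointer is emitted directly as `ptr null` in the intrinsic call.
__device__ __amdgpu_buffer_rsrc_t make_rsrc_null_sketch(short stride, int num, int flags) {
  return __builtin_amdgcn_make_buffer_rsrc((void *)0, stride, num, flags);
}
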
93 changes: 93 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
@@ -0,0 +1,93 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -cl-std=CL2.0 -target-cpu verde -emit-llvm -o - %s | FileCheck %s

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, /*stride=*/4, num, flags);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, /*num=*/1234, flags);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, /*flags=*/5678);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short stride, int num, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, /*stride=*/4, num, flags);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, /*num=*/1234, flags);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) {
return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, /*flags=*/5678);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) {
return __builtin_amdgcn_make_buffer_rsrc((void *)0LL, stride, num, flags);
}

// CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) {
return __builtin_amdgcn_make_buffer_rsrc((global void *)0LL, stride, num, flags);
}