Skip to content

[AMDGPU] Allow overload of __builtin_amdgcn_mov_dpp8 #113610

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_bf8, "fUiUif", "nc", "dot11-insts")
//===----------------------------------------------------------------------===//
TARGET_BUILTIN(__builtin_amdgcn_permlane16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts")
TARGET_BUILTIN(__builtin_amdgcn_permlanex16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts")
TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nc", "gfx10-insts")
TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nct", "gfx10-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_ttracedata_imm, "vIs", "n", "gfx10-insts")

//===----------------------------------------------------------------------===//
Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Sema/SemaAMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ class SemaAMDGPU : public SemaBase {

bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);

bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
unsigned NumDataArgs);

/// Create an AMDGPUWavesPerEUAttr attribute.
AMDGPUFlatWorkGroupSizeAttr *
CreateAMDGPUFlatWorkGroupSizeAttr(const AttributeCommonInfo &CI, Expr *Min,
Expand Down
16 changes: 10 additions & 6 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19115,8 +19115,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return emitBuiltinWithOneOverloadedType<2>(*this, E,
Intrinsic::amdgcn_ds_swizzle);
case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
return emitBuiltinWithOneOverloadedType<2>(*this, E,
Intrinsic::amdgcn_mov_dpp8);
case AMDGPU::BI__builtin_amdgcn_mov_dpp:
case AMDGPU::BI__builtin_amdgcn_update_dpp: {
llvm::SmallVector<llvm::Value *, 6> Args;
Expand All @@ -19130,14 +19128,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
unsigned Size = DataTy->getPrimitiveSizeInBits();
llvm::Type *IntTy =
llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, IntTy);
assert(E->getNumArgs() == 5 || E->getNumArgs() == 6);
bool InsertOld = E->getNumArgs() == 5;
Function *F =
CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
? Intrinsic::amdgcn_mov_dpp8
: Intrinsic::amdgcn_update_dpp,
IntTy);
assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
E->getNumArgs() == 2);
bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
if (InsertOld)
Args.push_back(llvm::PoisonValue::get(IntTy));
for (unsigned I = 0; I != E->getNumArgs(); ++I) {
llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
if (I <= (InsertOld ? 0u : 1u) && Size < 32) {
if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2 : 1) &&
Size < 32) {
if (!DataTy->isIntegerTy())
V = Builder.CreateBitCast(
V, llvm::IntegerType::get(Builder.getContext(), Size));
Expand Down
85 changes: 43 additions & 42 deletions clang/lib/Sema/SemaAMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,49 +63,12 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
OrderIndex = 0;
ScopeIndex = 1;
break;
case AMDGPU::BI__builtin_amdgcn_mov_dpp: {
if (SemaRef.checkArgCountRange(TheCall, 5, 5))
return true;
Expr *ValArg = TheCall->getArg(0);
QualType Ty = ValArg->getType();
// TODO: Vectors can also be supported.
if (!Ty->isArithmeticType() || Ty->isAnyComplexType()) {
SemaRef.Diag(ValArg->getBeginLoc(),
diag::err_typecheck_cond_expect_int_float)
<< Ty << ValArg->getSourceRange();
return true;
}
return false;
}
case AMDGPU::BI__builtin_amdgcn_mov_dpp:
return checkMovDPPFunctionCall(TheCall, 5, 1);
case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
return checkMovDPPFunctionCall(TheCall, 2, 1);
case AMDGPU::BI__builtin_amdgcn_update_dpp: {
if (SemaRef.checkArgCountRange(TheCall, 6, 6))
return true;
Expr *Args[2];
QualType ArgTys[2];
for (unsigned I = 0; I != 2; ++I) {
Args[I] = TheCall->getArg(I);
ArgTys[I] = Args[I]->getType();
// TODO: Vectors can also be supported.
if (!ArgTys[I]->isArithmeticType() || ArgTys[I]->isAnyComplexType()) {
SemaRef.Diag(Args[I]->getBeginLoc(),
diag::err_typecheck_cond_expect_int_float)
<< ArgTys[I] << Args[I]->getSourceRange();
return true;
}
}
if (getASTContext().hasSameUnqualifiedType(ArgTys[0], ArgTys[1]))
return false;
if (((ArgTys[0]->isUnsignedIntegerType() &&
ArgTys[1]->isSignedIntegerType()) ||
(ArgTys[0]->isSignedIntegerType() &&
ArgTys[1]->isUnsignedIntegerType())) &&
getASTContext().getTypeSize(ArgTys[0]) ==
getASTContext().getTypeSize(ArgTys[1]))
return false;
SemaRef.Diag(Args[1]->getBeginLoc(),
diag::err_typecheck_call_different_arg_types)
<< ArgTys[0] << ArgTys[1];
return true;
return checkMovDPPFunctionCall(TheCall, 6, 2);
}
default:
return false;
Expand Down Expand Up @@ -152,6 +115,44 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
return false;
}

bool SemaAMDGPU::checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
unsigned NumDataArgs) {
assert(NumDataArgs <= 2);
if (SemaRef.checkArgCountRange(TheCall, NumArgs, NumArgs))
return true;
Expr *Args[2];
QualType ArgTys[2];
for (unsigned I = 0; I != NumDataArgs; ++I) {
Args[I] = TheCall->getArg(I);
ArgTys[I] = Args[I]->getType();
// TODO: Vectors can also be supported.
if (!ArgTys[I]->isArithmeticType() || ArgTys[I]->isAnyComplexType()) {
SemaRef.Diag(Args[I]->getBeginLoc(),
diag::err_typecheck_cond_expect_int_float)
<< ArgTys[I] << Args[I]->getSourceRange();
return true;
}
}
if (NumDataArgs < 2)
return false;

if (getASTContext().hasSameUnqualifiedType(ArgTys[0], ArgTys[1]))
return false;

if (((ArgTys[0]->isUnsignedIntegerType() &&
ArgTys[1]->isSignedIntegerType()) ||
(ArgTys[0]->isSignedIntegerType() &&
ArgTys[1]->isUnsignedIntegerType())) &&
getASTContext().getTypeSize(ArgTys[0]) ==
getASTContext().getTypeSize(ArgTys[1]))
return false;

SemaRef.Diag(Args[1]->getBeginLoc(),
diag::err_typecheck_call_different_arg_types)
<< ArgTys[0] << ArgTys[1];
return true;
}

static bool
checkAMDGPUFlatWorkGroupSizeArguments(Sema &S, Expr *MinExpr, Expr *MaxExpr,
const AMDGPUFlatWorkGroupSizeAttr &Attr) {
Expand Down
60 changes: 57 additions & 3 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

typedef unsigned int uint;
typedef unsigned long ulong;

Expand All @@ -19,12 +21,64 @@ void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
}

// CHECK-LABEL: @test_mov_dpp8(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
void test_mov_dpp8(global uint* out, uint a) {
// CHECK-LABEL: @test_mov_dpp8_uint(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
// CHECK-NEXT: store i32 %0,
void test_mov_dpp8_uint(global uint* out, uint a) {
*out = __builtin_amdgcn_mov_dpp8(a, 1);
}

// CHECK-LABEL: @test_mov_dpp8_long(
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %a, i32 1)
// CHECK-NEXT: store i64 %0,
void test_mov_dpp8_long(global long* out, long a) {
*out = __builtin_amdgcn_mov_dpp8(a, 1);
}

// CHECK-LABEL: @test_mov_dpp8_float(
// CHECK: %0 = bitcast float %a to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
// CHECK-NEXT: store i32 %1,
void test_mov_dpp8_float(global float* out, float a) {
*out = __builtin_amdgcn_mov_dpp8(a, 1);
}

// CHECK-LABEL: @test_mov_dpp8_double
// CHECK: %0 = bitcast double %x to i64
// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
// CHECK-NEXT: store i64 %1,
void test_mov_dpp8_double(double x, global double *p) {
*p = __builtin_amdgcn_mov_dpp8(x, 1);
}

// CHECK-LABEL: @test_mov_dpp8_short
// CHECK: %0 = zext i16 %x to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
// CHECK-NEXT: %2 = trunc i32 %1 to i16
// CHECK-NEXT: store i16 %2,
void test_mov_dpp8_short(short x, global short *p) {
*p = __builtin_amdgcn_mov_dpp8(x, 1);
}

// CHECK-LABEL: @test_mov_dpp8_char
// CHECK: %0 = zext i8 %x to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
// CHECK-NEXT: %2 = trunc i32 %1 to i8
// CHECK-NEXT: store i8 %2,
void test_mov_dpp8_char(char x, global char *p) {
*p = __builtin_amdgcn_mov_dpp8(x, 1);
}

// CHECK-LABEL: @test_mov_dpp8_half
// CHECK: %0 = load i16,
// CHECK: %1 = zext i16 %0 to i32
// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1)
// CHECK-NEXT: %3 = trunc i32 %2 to i16
// CHECK-NEXT: store i16 %3,
void test_mov_dpp8_half(half *x, global half *p) {
*p = __builtin_amdgcn_mov_dpp8(*x, 1);
}

// CHECK-LABEL: @test_s_memtime
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime()
void test_s_memtime(global ulong* out)
Expand Down
19 changes: 19 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,30 @@
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx900 -verify -S -o - %s
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx908 -verify -S -o - %s

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

typedef unsigned int uint;
typedef int int2 __attribute__((ext_vector_type(2)));

struct S {
int x;
};

void test(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlane16(a, b, c, d, 1, 1); // expected-error {{'__builtin_amdgcn_permlane16' needs target feature gfx10-insts}}
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 1, 1); // expected-error {{'__builtin_amdgcn_permlanex16' needs target feature gfx10-insts}}
*out = __builtin_amdgcn_mov_dpp8(a, 1); // expected-error {{'__builtin_amdgcn_mov_dpp8' needs target feature gfx10-insts}}
}

void test_mov_dpp8(global int* out, int src, int i, int2 i2, struct S s, float _Complex fc)
{
*out = __builtin_amdgcn_mov_dpp8(src, i); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}}
*out = __builtin_amdgcn_mov_dpp8(src, 0.1); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}}
*out = __builtin_amdgcn_mov_dpp8(src); // expected-error{{too few arguments to function call, expected 2, have 1}}
*out = __builtin_amdgcn_mov_dpp8(src, 0, 0); // expected-error{{too many arguments to function call, expected at most 2, have 3}}
*out = __builtin_amdgcn_mov_dpp8(out, 0); // expected-error{{used type '__global int *__private' where integer or floating point type is required}}
*out = __builtin_amdgcn_mov_dpp8("aa", 0); // expected-error{{used type '__constant char[3]' where integer or floating point type is required}}
*out = __builtin_amdgcn_mov_dpp8(i2, 0); // expected-error{{used type '__private int2' (vector of 2 'int' values) where integer or floating point type is required}}
*out = __builtin_amdgcn_mov_dpp8(s, 0); // expected-error{{used type '__private struct S' where integer or floating point type is required}}
*out = __builtin_amdgcn_mov_dpp8(fc, 0); // expected-error{{used type '__private _Complex float' where integer or floating point type is required}}
}
Loading