-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Allow overload of __builtin_amdgcn_mov/update_dpp #112447
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
We need to support 64-bit data types (intrinsics do support it). We are also silently converting FP to integer argument now, also fixed.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-clang Author: Stanislav Mekhanoshin (rampitec) ChangesWe need to support 64-bit data types (intrinsics do support it). We are also silently converting FP to integer argument now, also fixed. Full diff: https://github.com/llvm/llvm-project/pull/112447.diff 3 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index c02970f55b22e1..e887213aa945e6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -224,8 +224,8 @@ TARGET_BUILTIN(__builtin_amdgcn_frexp_exph, "sh", "nc", "16-bit-insts")
TARGET_BUILTIN(__builtin_amdgcn_fracth, "hh", "nc", "16-bit-insts")
TARGET_BUILTIN(__builtin_amdgcn_classh, "bhi", "nc", "16-bit-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_memrealtime, "WUi", "n", "s-memrealtime")
-TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nc", "dpp")
-TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nc", "dpp")
+TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nct", "dpp")
+TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nct", "dpp")
TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "gfx8-insts")
TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c563f2618b42c8..22e707ca552d28 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19038,15 +19038,27 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ASTContext::GetBuiltinTypeError Error;
getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
assert(Error == ASTContext::GE_None && "Should not codegen an error");
+ llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
+ llvm::Type *IntTy = llvm::IntegerType::get(
+ Builder.getContext(), DataTy->getPrimitiveSizeInBits());
+ Function *F =
+ CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, IntTy);
+ assert(E->getNumArgs() == 5 || E->getNumArgs() == 6);
+ bool InsertOld = E->getNumArgs() == 5;
+ if (InsertOld)
+ Args.push_back(llvm::PoisonValue::get(IntTy));
for (unsigned I = 0; I != E->getNumArgs(); ++I) {
- Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, I, E));
+ llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
+ llvm::Type *ExpTy =
+ F->getFunctionType()->getFunctionParamType(I + InsertOld);
+ if (V->getType() != ExpTy)
+ V = Builder.CreateTruncOrBitCast(V, ExpTy);
+ Args.push_back(V);
}
- assert(Args.size() == 5 || Args.size() == 6);
- if (Args.size() == 5)
- Args.insert(Args.begin(), llvm::PoisonValue::get(Args[0]->getType()));
- Function *F =
- CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
- return Builder.CreateCall(F, Args);
+ llvm::Value *V = Builder.CreateCall(F, Args);
+ if (!DataTy->isIntegerTy())
+ V = Builder.CreateBitCast(V, DataTy);
+ return V;
}
case AMDGPU::BI__builtin_amdgcn_permlane16:
case AMDGPU::BI__builtin_amdgcn_permlanex16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 5bd8f77a5930c4..a2de7bb953d584 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -102,20 +102,66 @@ void test_s_dcache_wb()
__builtin_amdgcn_s_dcache_wb();
}
-// CHECK-LABEL: @test_mov_dpp
+// CHECK-LABEL: @test_mov_dpp_int
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %src, i32 0, i32 0, i32 0, i1 false)
-void test_mov_dpp(global int* out, int src)
+void test_mov_dpp_int(global int* out, int src)
{
*out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false);
}
-// CHECK-LABEL: @test_update_dpp
+// CHECK-LABEL: @test_mov_dpp_long
+// CHECK: %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %0,
+void test_mov_dpp_long(long x, global long *p) {
+ *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_mov_dpp_float
+// CHECK: %0 = bitcast float %x to i32
+// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i32 %1,
+void test_mov_dpp_float(float x, global float *p) {
+ *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_mov_dpp_double
+// CHECK: %0 = bitcast double %x to i64
+// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %1,
+void test_mov_dpp_double(double x, global double *p) {
+ *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_update_dpp_int
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
-void test_update_dpp(global int* out, int arg1, int arg2)
+void test_update_dpp_int(global int* out, int arg1, int arg2)
{
*out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
}
+// CHECK-LABEL: @test_update_dpp_long
+// CHECK: %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECk-NEXT: store i64 %0,
+void test_update_dpp_long(long x, global long *p) {
+ *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_update_dpp_float
+// CHECK: %0 = bitcast float %x to i32
+// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i32 %1,
+void test_update_dpp_float(float x, global float *p) {
+ *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_update_dpp_double
+// CHECK: %0 = bitcast double %x to i64
+// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %1,
+void test_update_dpp_double(double x, global double *p) {
+ *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+}
+
// CHECK-LABEL: @test_ds_fadd
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
// CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Note, there is also dpp8 with the similar problem. But dpp8 is not properly handled even if intrinsic is used with a 64-bit type (i.e. not split into 2 separate 32-bit dpp ops). This would be a nice to have, but not absolutely necessary like here, because there are no 64-bit real dpp8 operations. A best we can do is to split. On the contrary some of the 64-bit operations are legal with dpp16 and we need to give an usable interface. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This needs some sema type restrictions to make sure it's something sensible
+1 also need sema lit test |
Added. |
I actually wish a better way to have overloaded builtins in clang. I do not believe any user of these builtins is expecting that a wide integer will be silently truncated, and any fp will go through fptosi and backwards after, like we do now. We have much more builtins like that, which are supposed to work on a raw data and do not really take data type into account. Permute, swizzle, you name it. This is what we are doing now, for the:
we are producing:
I do not believe any user expects this. And what's worse a normal user does not know this is happening. |
Yes, C was a mistake |
breaks build of deviceLibs : swdev-493072 Revert 622e398 [AMDGPU] Allow overload of __builtin_amdgcn_mov/update_dpp (llvm#112447) Change-Id: Ia41446b077f701bed7b0c9aad82cbbfcd3c3e3db
We need to support 64-bit data types (intrinsics do support it). We are also silently converting FP to integer argument now, also fixed. Change-Id: I2a78e8e687884625f88e06da117fe7b8cb20dd01
We need to support 64-bit data types (intrinsics do support it). We are also silently converting FP to integer argument now, also fixed.