
[AMDGPU] Allow overload of __builtin_amdgcn_mov/update_dpp #112447


Merged

merged 11 commits into llvm:main from overloaded-dpp
Oct 21, 2024

Conversation

rampitec
Collaborator

We need to support 64-bit data types (intrinsics do support it). We are also silently converting FP to integer argument now, also fixed.

@rampitec rampitec requested a review from yxsamliu October 15, 2024 22:26
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:codegen IR generation bugs: mangling, exceptions, etc. labels Oct 15, 2024
@llvmbot
Member

llvmbot commented Oct 15, 2024

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-clang

Author: Stanislav Mekhanoshin (rampitec)

Changes

We need to support 64-bit data types (intrinsics do support it). We are also silently converting FP to integer argument now, also fixed.


Full diff: https://github.com/llvm/llvm-project/pull/112447.diff

3 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+2-2)
  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+19-7)
  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl (+50-4)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index c02970f55b22e1..e887213aa945e6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -224,8 +224,8 @@ TARGET_BUILTIN(__builtin_amdgcn_frexp_exph, "sh", "nc", "16-bit-insts")
 TARGET_BUILTIN(__builtin_amdgcn_fracth, "hh", "nc", "16-bit-insts")
 TARGET_BUILTIN(__builtin_amdgcn_classh, "bhi", "nc", "16-bit-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_memrealtime, "WUi", "n", "s-memrealtime")
-TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nc", "dpp")
-TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nc", "dpp")
+TARGET_BUILTIN(__builtin_amdgcn_mov_dpp, "iiIiIiIiIb", "nct", "dpp")
+TARGET_BUILTIN(__builtin_amdgcn_update_dpp, "iiiIiIiIiIb", "nct", "dpp")
 TARGET_BUILTIN(__builtin_amdgcn_s_dcache_wb, "v", "n", "gfx8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts")
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c563f2618b42c8..22e707ca552d28 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19038,15 +19038,27 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     ASTContext::GetBuiltinTypeError Error;
     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
     assert(Error == ASTContext::GE_None && "Should not codegen an error");
+    llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
+    llvm::Type *IntTy = llvm::IntegerType::get(
+        Builder.getContext(), DataTy->getPrimitiveSizeInBits());
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, IntTy);
+    assert(E->getNumArgs() == 5 || E->getNumArgs() == 6);
+    bool InsertOld = E->getNumArgs() == 5;
+    if (InsertOld)
+      Args.push_back(llvm::PoisonValue::get(IntTy));
     for (unsigned I = 0; I != E->getNumArgs(); ++I) {
-      Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, I, E));
+      llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
+      llvm::Type *ExpTy =
+          F->getFunctionType()->getFunctionParamType(I + InsertOld);
+      if (V->getType() != ExpTy)
+        V = Builder.CreateTruncOrBitCast(V, ExpTy);
+      Args.push_back(V);
     }
-    assert(Args.size() == 5 || Args.size() == 6);
-    if (Args.size() == 5)
-      Args.insert(Args.begin(), llvm::PoisonValue::get(Args[0]->getType()));
-    Function *F =
-        CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
-    return Builder.CreateCall(F, Args);
+    llvm::Value *V = Builder.CreateCall(F, Args);
+    if (!DataTy->isIntegerTy())
+      V = Builder.CreateBitCast(V, DataTy);
+    return V;
   }
   case AMDGPU::BI__builtin_amdgcn_permlane16:
   case AMDGPU::BI__builtin_amdgcn_permlanex16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 5bd8f77a5930c4..a2de7bb953d584 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -102,20 +102,66 @@ void test_s_dcache_wb()
   __builtin_amdgcn_s_dcache_wb();
 }
 
-// CHECK-LABEL: @test_mov_dpp
+// CHECK-LABEL: @test_mov_dpp_int
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %src, i32 0, i32 0, i32 0, i1 false)
-void test_mov_dpp(global int* out, int src)
+void test_mov_dpp_int(global int* out, int src)
 {
   *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false);
 }
 
-// CHECK-LABEL: @test_update_dpp
+// CHECK-LABEL: @test_mov_dpp_long
+// CHECK:      %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %0,
+void test_mov_dpp_long(long x, global long *p) {
+  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_mov_dpp_float
+// CHECK:      %0 = bitcast float %x to i32
+// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i32 %1,
+void test_mov_dpp_float(float x, global float *p) {
+  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_mov_dpp_double
+// CHECK:      %0 = bitcast double %x to i64
+// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %1,
+void test_mov_dpp_double(double x, global double *p) {
+  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_update_dpp_int
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
-void test_update_dpp(global int* out, int arg1, int arg2)
+void test_update_dpp_int(global int* out, int arg1, int arg2)
 {
   *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
 }
 
+// CHECK-LABEL: @test_update_dpp_long
+// CHECK:      %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %0,
+void test_update_dpp_long(long x, global long *p) {
+  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_update_dpp_float
+// CHECK:      %0 = bitcast float %x to i32
+// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i32 %1,
+void test_update_dpp_float(float x, global float *p) {
+  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+}
+
+// CHECK-LABEL: @test_update_dpp_double
+// CHECK:      %0 = bitcast double %x to i64
+// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %1,
+void test_update_dpp_double(double x, global double *p) {
+  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+}
+
 // CHECK-LABEL: @test_ds_fadd
 // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
 // CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}


github-actions bot commented Oct 15, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@rampitec
Collaborator Author

Note, there is also dpp8 with a similar problem. But dpp8 is not properly handled even if the intrinsic is used with a 64-bit type (i.e. it is not split into 2 separate 32-bit dpp ops). This would be a nice-to-have, but not absolutely necessary like here, because there are no real 64-bit dpp8 operations; the best we can do is to split. On the contrary, some of the 64-bit operations are legal with dpp16, and we need to provide a usable interface.

Contributor

@arsenm arsenm left a comment


This needs some sema type restrictions to make sure it's something sensible

@yxsamliu
Collaborator

This needs some sema type restrictions to make sure it's something sensible

+1 also need sema lit test

@rampitec
Collaborator Author

This needs some sema type restrictions to make sure it's something sensible

Added.

@rampitec
Collaborator Author

rampitec commented Oct 18, 2024

I actually wish there were a better way to have overloaded builtins in clang. I do not believe any user of these builtins expects that a wide integer will be silently truncated, or that any FP value will go through fptosi and back again, like we do now. We have many more builtins like that, which are supposed to work on raw data and do not really take the data type into account: permute, swizzle, you name it.

This is what we are doing now. For:

void test_mov_dpp_float(float x, global float *p) {
  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}

we are producing:

  %conv = fptosi float %x to i32
  %0 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %conv, i32 257, i32 15, i32 15, i1 false)
  %conv1 = sitofp i32 %0 to float
  store float %conv1, ptr addrspace(1) %p, align 4, !tbaa !15
  ret void

I do not believe any user expects this. And what's worse, a normal user does not know this is happening.

@arsenm
Contributor

arsenm commented Oct 21, 2024

I actually wish a better way to have overloaded builtins in clang. I do not believe any user of these builtins is expecting that a wide integer will be silently truncated, and any fp will go through fptosi and backwards after, like we do now.

Yes, C was a mistake

@rampitec rampitec merged commit 622e398 into llvm:main Oct 21, 2024
5 of 7 checks passed
@rampitec rampitec deleted the overloaded-dpp branch October 21, 2024 18:57
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Oct 22, 2024
breaks build of deviceLibs : swdev-493072
Revert 622e398   [AMDGPU] Allow overload of __builtin_amdgcn_mov/update_dpp (llvm#112447)

Change-Id: Ia41446b077f701bed7b0c9aad82cbbfcd3c3e3db
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Oct 23, 2024
We need to support 64-bit data types (intrinsics do support it). We are
also silently converting FP to integer argument now, also fixed.

Change-Id: I2a78e8e687884625f88e06da117fe7b8cb20dd01