[AMDGPU] - Add clang builtins for tied WMMA intrinsics #70669

OutOfCache · 2023-10-30T15:28:30Z

Add clang builtins for the new tied WMMA intrinsics.
These variations tie the destination accumulator matrix
to the input accumulator matrix.

See #69903 for context.

llvmbot · 2023-10-30T15:29:39Z

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-clang-codegen

Author: Jessica Del (OutOfCache)

Changes

Add clang builtins for the new tied WMMA intrinsics.
These variations tie the destination accumulator matrix
to the input accumulator matrix.

See #69903 for context.

Full diff: https://github.com/llvm/llvm-project/pull/70669.diff

4 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+4)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+14)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl (+30)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl (+30)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 532a91fd903e87c..a19c8bd5f219ec6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -292,6 +292,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts")
 
@@ -299,6 +301,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts")
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index dce5ee5888c458e..d162fdfbbfd8921 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17917,9 +17917,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   }
 
   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
@@ -17957,6 +17961,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ArgForMatchingRetType = 2;
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
+      ArgForMatchingRetType = 2;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
+      ArgForMatchingRetType = 2;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
+      break;
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
       ArgForMatchingRetType = 4;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
index 07e4f71f6dbd32a..4f13c75e5e81f98 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
@@ -74,6 +74,36 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v
   *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a, b, c, true);
 }
 
+//
+// amdgcn_wmma_f16_16x16x16_f16_tied
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(
+// CHECK-GFX1100-NEXT:  entry:
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT:    store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1100-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b, v16h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16_tied
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(
+// CHECK-GFX1100-NEXT:  entry:
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT:    store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1100-NEXT:    ret void
+//
+void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s b, v16s c)
+{
+  *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a, b, c, true);
+}
+
 //
 // amdgcn_wmma_i32_16x16x16_iu8
 //
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
index 6bbcbec959f3086..4797675f50d42eb 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
@@ -76,6 +76,36 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8
   *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a, b, c, true);
 }
 
+//
+// amdgcn_wmma_f16_16x16x16_f16_tied
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(
+// CHECK-GFX1100-NEXT:  entry:
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1100-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, v8h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16_tied
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(
+// CHECK-GFX1100-NEXT:  entry:
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT:    store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1100-NEXT:    ret void
+//
+void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s b, v8s c)
+{
+  *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a, b, c, true);
+}
+
 //
 // amdgcn_wmma_i32_16x16x16_iu8
 //

rampitec

Negative tests are also needed: one verifying that the gfx11-insts feature is required (for example, using a gfx1030 target) and one for the immediate arguments. See, for example, builtins-amdgcn-gfx11-err.cl and builtins-amdgcn-fp-atomics-gfx11-err.cl.

nhaehnle · 2023-11-01T21:13:53Z

(removing myself from this PR)

Add clang builtins for the new tied wmma intrinsics. These variations tie the destination accumulator matrix to the input accumulator matrix. Add negative tests for gfx10, since we do not support the wmma intrinsics before gfx11.

rampitec · 2023-11-09T18:01:56Z

clang/include/clang/Basic/BuiltinsAMDGPU.def

@@ -292,13 +292,17 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")


A negative test is needed to check that the last operand must always be a constant integer. We add one every time the 'I' modifier is used.

I see, thanks for letting me know! I have tried to add that now.

rampitec

LGTM

Add clang builtins for the new tied wmma intrinsics. These variations tie the destination accumulator matrix to the input accumulator matrix. See llvm#69903 for context.

OutOfCache requested a review from piotrAMD October 30, 2023 15:28

llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:codegen IR generation bugs: mangling, exceptions, etc. labels Oct 30, 2023

OutOfCache requested a review from nhaehnle October 30, 2023 15:28

OutOfCache mentioned this pull request Oct 30, 2023

[AMDGPU][wmma] - Add tied wmma intrinsic #69903

Merged

OutOfCache requested review from scchan and rampitec October 31, 2023 15:05

rampitec reviewed Oct 31, 2023

View reviewed changes

nhaehnle removed their request for review November 1, 2023 21:13

[AMDGPU] - Add clang builtins for tied WMMA intrinsics

75db77f

Add clang builtins for the new tied wmma intrinsics. These variations tie the destination accumulator matrix to the input accumulator matrix. Add negative tests for gfx10, since we do not support the wmma intrinsics before gfx11.

OutOfCache force-pushed the wmma-pseudo-builtins branch from 66a0e00 to 75db77f Compare November 9, 2023 10:29

OutOfCache requested a review from rampitec November 9, 2023 10:34

rampitec reviewed Nov 9, 2023

View reviewed changes

fixup! [AMDGPU] - Add clang builtins for tied WMMA intrinsics

34ba12a

OutOfCache requested a review from rampitec November 10, 2023 14:50

rampitec approved these changes Nov 10, 2023

View reviewed changes

OutOfCache merged commit b025864 into llvm:main Nov 13, 2023

OutOfCache deleted the wmma-pseudo-builtins branch November 13, 2023 12:23

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] - Add clang builtins for tied WMMA intrinsics #70669

[AMDGPU] - Add clang builtins for tied WMMA intrinsics #70669

Uh oh!

OutOfCache commented Oct 30, 2023

Uh oh!

llvmbot commented Oct 30, 2023 •

edited

Loading

Uh oh!

rampitec left a comment

Uh oh!

nhaehnle commented Nov 1, 2023

Uh oh!

rampitec Nov 9, 2023

Uh oh!

OutOfCache Nov 10, 2023

Uh oh!

rampitec left a comment

Uh oh!

Uh oh!

[AMDGPU] - Add clang builtins for tied WMMA intrinsics #70669

[AMDGPU] - Add clang builtins for tied WMMA intrinsics #70669

Uh oh!

Conversation

OutOfCache commented Oct 30, 2023

Uh oh!

llvmbot commented Oct 30, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

rampitec left a comment

Choose a reason for hiding this comment

Uh oh!

nhaehnle commented Nov 1, 2023

Uh oh!

rampitec Nov 9, 2023

Choose a reason for hiding this comment

Uh oh!

OutOfCache Nov 10, 2023

Choose a reason for hiding this comment

Uh oh!

rampitec left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvmbot commented Oct 30, 2023 •

edited

Loading