Skip to content

[AMDGPU] - Add clang builtins for tied WMMA intrinsics #70669

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 13, 2023

Conversation

OutOfCache
Copy link
Contributor

Add clang builtins for the new tied wmma intrinsics.
These variations tie the destination
accumulator matrix to the input
accumulator matrix.

See #69903 for context.

@OutOfCache OutOfCache requested a review from piotrAMD October 30, 2023 15:28
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:codegen IR generation bugs: mangling, exceptions, etc. labels Oct 30, 2023
@OutOfCache OutOfCache requested a review from nhaehnle October 30, 2023 15:28
@llvmbot
Copy link
Member

llvmbot commented Oct 30, 2023

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-clang-codegen

Author: Jessica Del (OutOfCache)

Changes

Add clang builtins for the new tied wmma intrinsics.
These variations tie the destination
accumulator matrix to the input
accumulator matrix.

See #69903 for context.


Full diff: https://github.com/llvm/llvm-project/pull/70669.diff

4 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+4)
  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+14)
  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl (+30)
  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl (+30)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 532a91fd903e87c..a19c8bd5f219ec6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -292,6 +292,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts")
 
@@ -299,6 +301,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts")
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index dce5ee5888c458e..d162fdfbbfd8921 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17917,9 +17917,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   }
 
   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
   case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
   case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
   case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
@@ -17957,6 +17961,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ArgForMatchingRetType = 2;
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
+      ArgForMatchingRetType = 2;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
+      ArgForMatchingRetType = 2;
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
+      break;
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
       ArgForMatchingRetType = 4;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
index 07e4f71f6dbd32a..4f13c75e5e81f98 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
@@ -74,6 +74,36 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v
   *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a, b, c, true);
 }
 
+//
+// amdgcn_wmma_f16_16x16x16_f16_tied
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(
+// CHECK-GFX1100-NEXT:  entry:
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT:    store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1100-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b, v16h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16_tied
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(
+// CHECK-GFX1100-NEXT:  entry:
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT:    store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1100-NEXT:    ret void
+//
+void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s b, v16s c)
+{
+  *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a, b, c, true);
+}
+
 //
 // amdgcn_wmma_i32_16x16x16_iu8
 //
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
index 6bbcbec959f3086..4797675f50d42eb 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
@@ -76,6 +76,36 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8
   *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a, b, c, true);
 }
 
+//
+// amdgcn_wmma_f16_16x16x16_f16_tied
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(
+// CHECK-GFX1100-NEXT:  entry:
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1100-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, v8h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a, b, c, true);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16_tied
+//
+
+// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(
+// CHECK-GFX1100-NEXT:  entry:
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT:    store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1100-NEXT:    ret void
+//
+void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s b, v8s c)
+{
+  *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a, b, c, true);
+}
+
 //
 // amdgcn_wmma_i32_16x16x16_iu8
 //

Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also needed negative tests that gfx11-insts feature is required (using gfx1030 target for example) and for the immediate arguments. See for example builtins-amdgcn-gfx11-err.cl and builtins-amdgcn-fp-atomics-gfx11-err.cl.

@nhaehnle nhaehnle removed their request for review November 1, 2023 21:13
@nhaehnle
Copy link
Collaborator

nhaehnle commented Nov 1, 2023

(removing myself from this PR)

Add clang builtins for the new tied wmma intrinsics.
These variations tie the destination
accumulator matrix to the input
accumulator matrix.

Add negative tests for gfx10, since we do not support
the wmma intrinsics before gfx11.
@OutOfCache OutOfCache force-pushed the wmma-pseudo-builtins branch from 66a0e00 to 75db77f Compare November 9, 2023 10:29
@OutOfCache OutOfCache requested a review from rampitec November 9, 2023 10:34
@@ -292,13 +292,17 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to add negative test for the last operand to always be a constant integer. We do it every time 'I' modifier is used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, thanks for letting me know! I tried to do so now.

@OutOfCache OutOfCache requested a review from rampitec November 10, 2023 14:50
Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@OutOfCache OutOfCache merged commit b025864 into llvm:main Nov 13, 2023
@OutOfCache OutOfCache deleted the wmma-pseudo-builtins branch November 13, 2023 12:23
zahiraam pushed a commit to zahiraam/llvm-project that referenced this pull request Nov 20, 2023
Add clang builtins for the new tied wmma intrinsics. 
These variations tie the destination
accumulator matrix to the input
accumulator matrix.

See llvm#69903 for context.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU clang:codegen IR generation bugs: mangling, exceptions, etc. clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants