Skip to content

Commit 75db77f

Browse files
committed
[AMDGPU] - Add clang builtins for tied WMMA intrinsics
Add clang builtins for the new tied WMMA intrinsics. These variants tie the destination accumulator matrix to the input accumulator matrix. Also add negative tests for gfx10, since the WMMA intrinsics are not supported before gfx11.
1 parent 06881d2 commit 75db77f

File tree

6 files changed

+146
-0
lines changed

6 files changed

+146
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,13 +292,17 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc
292292
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts")
293293
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
294294
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
295+
// Tied wave32 variants of the f16/bf16-result WMMA builtins. Each entry reuses
// the prototype string, attribute string ("nc") and required feature
// ("gfx11-insts") of its untied counterpart declared just above.
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts")
296+
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts")
295297
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts")
296298
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts")
297299

298300
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc", "gfx11-insts")
299301
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts")
300302
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts")
301303
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts")
304+
// Tied wave64 variants of the f16/bf16-result WMMA builtins. Each entry reuses
// the prototype string, attribute string ("nc") and required feature
// ("gfx11-insts") of its untied counterpart declared just above.
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts")
305+
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts")
302306
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts")
303307
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts")
304308

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17936,9 +17936,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1793617936
}
1793717937

1793817938
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
17939+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
1793917940
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
17941+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
1794017942
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
17943+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
1794117944
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
17945+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
1794217946
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
1794317947
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
1794417948
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
@@ -17976,6 +17980,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1797617980
ArgForMatchingRetType = 2;
1797717981
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
1797817982
break;
17983+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
17984+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
17985+
ArgForMatchingRetType = 2;
17986+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
17987+
break;
17988+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
17989+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
17990+
ArgForMatchingRetType = 2;
17991+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
17992+
break;
1797917993
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
1798017994
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
1798117995
ArgForMatchingRetType = 4;
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// Negative test: every wave32 WMMA builtin must be diagnosed on gfx1030 because it needs the gfx11-insts target feature.
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 \
4+
// RUN: -verify -DWMMA_GFX1100_TESTS -S -o - %s
5+
6+
7+
typedef float v4f __attribute__((ext_vector_type(4)));
8+
typedef float v8f __attribute__((ext_vector_type(8)));
9+
typedef half v16h __attribute__((ext_vector_type(16)));
10+
typedef int v2i __attribute__((ext_vector_type(2)));
11+
typedef int v4i __attribute__((ext_vector_type(4)));
12+
typedef int v8i __attribute__((ext_vector_type(8)));
13+
typedef short v16s __attribute__((ext_vector_type(16)));
14+
15+
#ifdef WMMA_GFX1100_TESTS
16+
17+
// Wave32
18+
19+
// Negative test body for the wave32 WMMA builtins.
//
// Calls each wave32 WMMA builtin once (f32/f16, f32/bf16, f16, bf16, the tied
// f16 and bf16 forms, iu8 and iu4). Every call carries an expected-error
// annotation stating that the builtin needs target feature gfx11-insts, so the
// annotation must stay on the same line as the call it describes.
//
// NOTE(review): the function name mentions only the f32_16x16x16_bf16 builtin
// although the body exercises all eight; consider a more general name.
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f,
20+
global v16h* out16h, v16h a16h, v16h b16h, v16h c16h,
21+
global v16s* out16s, v2i a2i, v2i b2i, v16s c16s,
22+
global v8i* out8i, v4i a4i, v4i b4i, v8i c8i)
23+
{
24+
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature gfx11-insts}}
25+
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature gfx11-insts}}
26+
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature gfx11-insts}}
27+
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature gfx11-insts}}
28+
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature gfx11-insts}}
29+
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature gfx11-insts}}
30+
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature gfx11-insts}}
31+
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature gfx11-insts}}
32+
}
33+
34+
#endif

clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,36 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v
7474
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a, b, c, true);
7575
}
7676

77+
//
78+
// amdgcn_wmma_f16_16x16x16_f16_tied
79+
//
80+
81+
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(
82+
// CHECK-GFX1100-NEXT: entry:
83+
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
84+
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
85+
// CHECK-GFX1100-NEXT: ret void
86+
//
87+
// Codegen test for the tied f16 WMMA builtin (wave32): passes a, b and c plus
// a literal `true` as the fourth argument and stores the v16h result via `out`.
void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b, v16h c)
88+
{
89+
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a, b, c, true);
90+
}
91+
92+
//
93+
// amdgcn_wmma_bf16_16x16x16_bf16_tied
94+
//
95+
96+
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(
97+
// CHECK-GFX1100-NEXT: entry:
98+
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
99+
// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
100+
// CHECK-GFX1100-NEXT: ret void
101+
//
102+
// Codegen test for the tied bf16 WMMA builtin (wave32): passes a, b and c plus
// a literal `true` as the fourth argument and stores the v16s result via `out`.
void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s b, v16s c)
103+
{
104+
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a, b, c, true);
105+
}
106+
77107
//
78108
// amdgcn_wmma_i32_16x16x16_iu8
79109
//
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// Negative test: every wave64 WMMA builtin must be diagnosed on gfx1030 because it needs the gfx11-insts target feature.
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 \
4+
// RUN: -verify -DWMMA_GFX1100_TESTS -S -o - %s
5+
6+
7+
typedef float v4f __attribute__((ext_vector_type(4)));
8+
typedef half v8h __attribute__((ext_vector_type(8)));
9+
typedef half v16h __attribute__((ext_vector_type(16)));
10+
typedef int v2i __attribute__((ext_vector_type(2)));
11+
typedef int v4i __attribute__((ext_vector_type(4)));
12+
typedef short v8s __attribute__((ext_vector_type(8)));
13+
typedef short v16s __attribute__((ext_vector_type(16)));
14+
15+
#ifdef WMMA_GFX1100_TESTS
16+
17+
// Wave64
18+
19+
// Negative test body for the wave64 WMMA builtins.
//
// Calls each wave64 WMMA builtin once (f32/f16, f32/bf16, f16, bf16, the tied
// f16 and bf16 forms, iu8 and iu4). Every call carries an expected-error
// annotation stating that the builtin needs target feature gfx11-insts, so the
// annotation must stay on the same line as the call it describes.
//
// NOTE(review): the function name mentions only the f32_16x16x16_bf16 builtin
// although the body exercises all eight; consider a more general name.
void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out4f, v16h a16h, v16h b16h, v4f c4f,
20+
global v8h* out8h, v16s a16s, v16s b16s, v8h c8h,
21+
global v8s* out8s, v4i a4i, v4i b4i, v8s c8s,
22+
global v4i* out4i, v2i a2i, v2i b2i, v4i c4i)
23+
{
24+
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature gfx11-insts}}
25+
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature gfx11-insts}}
26+
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature gfx11-insts}}
27+
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature gfx11-insts}}
28+
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature gfx11-insts}}
29+
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature gfx11-insts}}
30+
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature gfx11-insts}}
31+
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature gfx11-insts}}
32+
}
33+
34+
#endif

clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,36 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8
7676
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a, b, c, true);
7777
}
7878

79+
//
80+
// amdgcn_wmma_f16_16x16x16_f16_tied
81+
//
82+
83+
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(
84+
// CHECK-GFX1100-NEXT: entry:
85+
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
86+
// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
87+
// CHECK-GFX1100-NEXT: ret void
88+
//
89+
// Codegen test for the tied f16 WMMA builtin (wave64): passes the v16h inputs
// a and b, the v8h input c and a literal `true` as the fourth argument, then
// stores the v8h result via `out`.
void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, v8h c)
90+
{
91+
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a, b, c, true);
92+
}
93+
94+
//
95+
// amdgcn_wmma_bf16_16x16x16_bf16_tied
96+
//
97+
98+
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(
99+
// CHECK-GFX1100-NEXT: entry:
100+
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
101+
// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
102+
// CHECK-GFX1100-NEXT: ret void
103+
//
104+
// Codegen test for the tied bf16 WMMA builtin (wave64): passes the v16s inputs
// a and b, the v8s input c and a literal `true` as the fourth argument, then
// stores the v8s result via `out`.
void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s b, v8s c)
105+
{
106+
*out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a, b, c, true);
107+
}
108+
79109
//
80110
// amdgcn_wmma_i32_16x16x16_iu8
81111
//

0 commit comments

Comments
 (0)