Skip to content

Commit a8d9d50

Browse files
committed
[AMDGPU] gfx90a support
Differential Revision: https://reviews.llvm.org/D96906
1 parent 0252e6e commit a8d9d50

File tree

359 files changed

+76291
-7268
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

359 files changed

+76291
-7268
lines changed

clang/docs/ClangCommandLineReference.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2982,6 +2982,10 @@ Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only
29822982

29832983
Specify SRAM ECC mode (AMDGPU only)
29842984

2985+
.. option:: -mtgsplit, -mno-tgsplit
2986+
2987+
Enable threadgroup split execution mode (AMDGPU only)
2988+
29852989
.. option:: -mxnack, -mno-xnack
29862990

29872991
Specify XNACK mode (AMDGPU only)

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,5 +258,13 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x2bf16, "V4fV2sV2sV4fIiIiIi", "nc",
258258
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16, "V16fV2sV2sV16fIiIiIi", "nc", "mai-insts")
259259
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8bf16, "V4fV2sV2sV4fIiIiIi", "nc", "mai-insts")
260260

261+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16_1k, "V32fV4sV4sV32fIiIiIi", "nc", "mai-insts")
262+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x4bf16_1k, "V16fV4sV4sV16fIiIiIi", "nc", "mai-insts")
263+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x4bf16_1k, "V4fV4sV4sV4fIiIiIi", "nc", "mai-insts")
264+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x8bf16_1k, "V16fV4sV4sV16fIiIiIi", "nc", "mai-insts")
265+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16bf16_1k, "V4fV4sV4sV4fIiIiIi", "nc", "mai-insts")
266+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_16x16x4f64, "V4dddV4dIiIiIi", "nc", "mai-insts")
267+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_4x4x4f64, "ddddIiIiIi", "nc", "mai-insts")
268+
261269
#undef BUILTIN
262270
#undef TARGET_BUILTIN

clang/include/clang/Basic/Cuda.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ enum class CudaArch {
7878
GFX906,
7979
GFX908,
8080
GFX909,
81+
GFX90a,
8182
GFX90c,
8283
GFX1010,
8384
GFX1011,

clang/include/clang/Driver/Options.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3094,6 +3094,11 @@ def mcumode : Flag<["-"], "mcumode">, Group<m_amdgpu_Features_Group>,
30943094
HelpText<"Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only)">;
30953095
def mno_cumode : Flag<["-"], "mno-cumode">, Group<m_amdgpu_Features_Group>;
30963096

3097+
def mtgsplit : Flag<["-"], "mtgsplit">, Group<m_amdgpu_Features_Group>,
3098+
HelpText<"Enable threadgroup split execution mode (AMDGPU only)">;
3099+
def mno_tgsplit : Flag<["-"], "mno-tgsplit">, Group<m_amdgpu_Features_Group>,
3100+
HelpText<"Disable threadgroup split execution mode (AMDGPU only)">;
3101+
30973102
def mwavefrontsize64 : Flag<["-"], "mwavefrontsize64">, Group<m_Group>,
30983103
HelpText<"Specify wavefront size 64 mode (AMDGPU only)">;
30993104
def mno_wavefrontsize64 : Flag<["-"], "mno-wavefrontsize64">, Group<m_Group>,

clang/lib/Basic/Cuda.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ CudaArchToStringMap arch_names[] = {
9898
GFX(906), // gfx906
9999
GFX(908), // gfx908
100100
GFX(909), // gfx909
101+
GFX(90a), // gfx90a
101102
GFX(90c), // gfx90c
102103
GFX(1010), // gfx1010
103104
GFX(1011), // gfx1011

clang/lib/Basic/Targets/AMDGPU.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,9 @@ bool AMDGPUTargetInfo::initFeatureMap(
212212
Features["s-memrealtime"] = true;
213213
Features["s-memtime-inst"] = true;
214214
break;
215+
case GK_GFX90A:
216+
Features["gfx90a-insts"] = true;
217+
LLVM_FALLTHROUGH;
215218
case GK_GFX908:
216219
Features["dot3-insts"] = true;
217220
Features["dot4-insts"] = true;

clang/lib/Basic/Targets/NVPTX.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
202202
case CudaArch::GFX906:
203203
case CudaArch::GFX908:
204204
case CudaArch::GFX909:
205+
case CudaArch::GFX90a:
205206
case CudaArch::GFX90c:
206207
case CudaArch::GFX1010:
207208
case CudaArch::GFX1011:

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4634,6 +4634,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
46344634
case CudaArch::GFX906:
46354635
case CudaArch::GFX908:
46364636
case CudaArch::GFX909:
4637+
case CudaArch::GFX90a:
46374638
case CudaArch::GFX90c:
46384639
case CudaArch::GFX1010:
46394640
case CudaArch::GFX1011:
@@ -4703,6 +4704,7 @@ static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
47034704
case CudaArch::GFX906:
47044705
case CudaArch::GFX908:
47054706
case CudaArch::GFX909:
4707+
case CudaArch::GFX90a:
47064708
case CudaArch::GFX90c:
47074709
case CudaArch::GFX1010:
47084710
case CudaArch::GFX1011:

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX906 %s
2424
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX908 %s
2525
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s
26+
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s
2627
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s
2728
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s
2829
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s
@@ -52,6 +53,7 @@
5253
// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
5354
// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
5455
// GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
56+
// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
5557
// GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
5658
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
5759
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
Lines changed: 98 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// REQUIRES: amdgpu-registered-target
2-
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck %s
2+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
34

45
#pragma OPENCL EXTENSION cl_khr_fp64:enable
56

@@ -19,143 +20,199 @@ typedef short v32s __attribute__((ext_vector_type(32)));
1920
typedef double v4d __attribute__((ext_vector_type(4)));
2021

2122

22-
// CHECK-LABEL: @test_mfma_f32_32x32x1f32
23-
// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
23+
#ifdef MFMA_GFX908_TESTS
24+
25+
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
26+
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
2427
void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
2528
{
2629
*out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
2730
}
2831

29-
// CHECK-LABEL: @test_mfma_f32_16x16x1f32
30-
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
32+
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
33+
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
3134
void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
3235
{
3336
*out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
3437
}
3538

36-
// CHECK-LABEL: @test_mfma_f32_4x4x1f32
37-
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
39+
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
40+
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
3841
void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
3942
{
4043
*out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
4144
}
4245

43-
// CHECK-LABEL: @test_mfma_f32_32x32x2f32
44-
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
46+
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
47+
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
4548
void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
4649
{
4750
*out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
4851
}
4952

50-
// CHECK-LABEL: @test_mfma_f32_16x16x4f32
51-
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
53+
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
54+
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
5255
void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
5356
{
5457
*out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
5558
}
5659

57-
// CHECK-LABEL: @test_mfma_f32_32x32x4f16
58-
// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
60+
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
61+
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
5962
void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
6063
{
6164
*out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
6265
}
6366

64-
// CHECK-LABEL: @test_mfma_f32_16x16x4f16
65-
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
67+
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
68+
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
6669
void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
6770
{
6871
*out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
6972
}
7073

71-
// CHECK-LABEL: @test_mfma_f32_4x4x4f16
72-
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
74+
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
75+
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
7376
void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
7477
{
7578
*out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
7679
}
7780

78-
// CHECK-LABEL: @test_mfma_f32_32x32x8f16
79-
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
81+
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
82+
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
8083
void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
8184
{
8285
*out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
8386
}
8487

85-
// CHECK-LABEL: @test_mfma_f32_16x16x16f16
86-
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
88+
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
89+
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
8790
void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
8891
{
8992
*out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
9093
}
9194

92-
// CHECK-LABEL: @test_mfma_i32_32x32x4i8
93-
// CHECK: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
95+
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
96+
// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
9497
void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
9598
{
9699
*out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
97100
}
98101

99-
// CHECK-LABEL: @test_mfma_i32_16x16x4i8
100-
// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
102+
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
103+
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
101104
void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
102105
{
103106
*out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
104107
}
105108

106-
// CHECK-LABEL: @test_mfma_i32_4x4x4i8
107-
// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
109+
// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
110+
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
108111
void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
109112
{
110113
*out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
111114
}
112115

113-
// CHECK-LABEL: @test_mfma_i32_32x32x8i8
114-
// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
116+
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
117+
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
115118
void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
116119
{
117120
*out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
118121
}
119122

120-
// CHECK-LABEL: @test_mfma_i32_16x16x16i8
121-
// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
123+
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
124+
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
122125
void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
123126
{
124127
*out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
125128
}
126129

127-
// CHECK-LABEL: @test_mfma_f32_32x32x2bf16
128-
// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
130+
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
131+
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
129132
void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
130133
{
131134
*out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
132135
}
133136

134-
// CHECK-LABEL: @test_mfma_f32_16x16x2bf16
135-
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
137+
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
138+
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
136139
void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
137140
{
138141
*out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
139142
}
140143

141-
// CHECK-LABEL: @test_mfma_f32_4x4x2bf16
142-
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
144+
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
145+
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
143146
void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
144147
{
145148
*out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
146149
}
147150

148-
// CHECK-LABEL: @test_mfma_f32_32x32x4bf16
149-
// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
151+
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
152+
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
150153
void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
151154
{
152155
*out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
153156
}
154157

155-
// CHECK-LABEL: @test_mfma_f32_16x16x8bf16
156-
// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
158+
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
159+
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
157160
void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
158161
{
159162
*out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
160163
}
161164

165+
#endif // MFMA_GFX908_TESTS
166+
167+
#ifdef MFMA_GFX90A_TESTS
168+
169+
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
170+
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
171+
void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
172+
{
173+
*out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
174+
}
175+
176+
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
177+
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
178+
void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
179+
{
180+
*out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
181+
}
182+
183+
// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
184+
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
185+
void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
186+
{
187+
*out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
188+
}
189+
190+
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
191+
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
192+
void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
193+
{
194+
*out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
195+
}
196+
197+
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
198+
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
199+
void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
200+
{
201+
*out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
202+
}
203+
204+
// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
205+
// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
206+
void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
207+
{
208+
*out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
209+
}
210+
211+
// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
212+
// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
213+
void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
214+
{
215+
*out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
216+
}
217+
218+
#endif // MFMA_GFX90A_TESTS

clang/test/Driver/amdgpu-features.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
// RUN: %clang -### -target amdgcn-amdhsa -mcpu=gfx908:sramecc- %s 2>&1 | FileCheck --check-prefix=NO-SRAM-ECC %s
2323
// NO-SRAM-ECC: "-target-feature" "-sramecc"
2424

25+
// RUN: %clang -### -target amdgcn -mcpu=gfx90a -mtgsplit %s 2>&1 | FileCheck --check-prefix=TGSPLIT %s
26+
// RUN: %clang -### -target amdgcn -mcpu=gfx90a -mno-tgsplit %s 2>&1 | FileCheck --check-prefix=NO-TGSPLIT %s
27+
// TGSPLIT: "-target-feature" "+tgsplit"
28+
// NO-TGSPLIT: "-target-feature" "-tgsplit"
29+
2530
// RUN: %clang -### -target amdgcn-amdpal -mcpu=gfx1010 -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
2631
// RUN: %clang -### -target amdgcn-amdpal -mcpu=gfx1010 -mno-wavefrontsize64 -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s
2732
// WAVE64: "-target-feature" "+wavefrontsize64"

clang/test/Driver/amdgpu-macros.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@
105105
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx906
106106
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx908
107107
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx909
108+
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90a
108109
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90c
109110
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010
110111
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1011

0 commit comments

Comments
 (0)