Skip to content

Commit 0bcabae

Browse files
authored
[SYCL][HIP] Update amd joint_matrix tests to reflect changes to joint_matrix_mad API. (#13250)
- The` joint_matrix_mad` API has been modified to accept the output as an argument to the function. This pull request updates the relevant tests to accommodate this change for amd gpu. - Minor update to check joint_matrix parameters in compile time.
1 parent 75afc83 commit 0bcabae

File tree

5 files changed

+20
-43
lines changed

5 files changed

+20
-43
lines changed
Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
// REQUIRES: hip
2-
32
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o compile-query-hip
43

54
#include <iostream>
@@ -14,21 +13,16 @@ int main() {
1413
using myparams = matrix_params<architecture::amd_gpu_gfx90a, int8_t, int8_t,
1514
int32_t, int32_t, 32, 32, 8>;
1615

17-
size_t dmsize = myparams::M;
18-
size_t dnsize = myparams::N;
19-
size_t dksize = myparams::K;
20-
std::cout
21-
<< "sizes of AMD gpu gfx90a matrix_params chosen by the user are: M "
22-
<< dmsize << " N " << dnsize << " K " << dksize << std::endl;
16+
static_assert(myparams::M == 32);
17+
static_assert(myparams::N == 32);
18+
static_assert(myparams::K == 8);
2319

2420
// Sizes-only compile-time query: types are given, generate default sizes
2521
using myparams2 = matrix_params<architecture::amd_gpu_gfx90a, int8_t, int8_t,
2622
int32_t, int32_t>;
27-
myparams2 p;
28-
dmsize = myparams2::M;
29-
dnsize = myparams2::N;
30-
dksize = myparams2::K;
31-
std::cout << "default AMD gpu gfx90a sizes matrix_params are: M " << dmsize
32-
<< " N " << dnsize << " K " << dksize << std::endl;
23+
static_assert(myparams2::M == 16);
24+
static_assert(myparams2::N == 16);
25+
static_assert(myparams2::K == 4);
26+
3327
return 0;
3428
};

sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
// REQUIRES: hip
2-
// XFAIL: hip
3-
42
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
53

64
#include <sycl/sycl.hpp>
@@ -10,12 +8,10 @@ using namespace sycl::ext::oneapi::experimental::matrix;
108
using sycl::ext::oneapi::bfloat16;
119

1210
int main() {
13-
1411
buffer<bfloat16, 1> bufA(nullptr, range<1>(1));
1512
buffer<bfloat16, 1> bufB(nullptr, range<1>(1));
1613
buffer<float, 1> bufC(nullptr, range<1>(1));
1714
buffer<float, 1> bufD(nullptr, range<1>(1));
18-
1915
queue q;
2016

2117
q.submit([&](handler &cgh) {
@@ -42,9 +38,8 @@ int main() {
4238
sub_a{};
4339
joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::row_major>
4440
sub_b{};
45-
46-
// CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %{{.*}}, <4 x i16> %{{.*}} <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
47-
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
41+
// CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
42+
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
4843
joint_matrix_store(
4944
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
5045
16, layout::row_major);
@@ -61,8 +56,8 @@ int main() {
6156
joint_matrix<sub_group, bfloat16, use::b, 8, 32, layout::col_major>
6257
sub_b{};
6358

64-
// CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> {{.*}}, <4 x i16> {{.*}}, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
65-
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
59+
// CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
60+
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
6661
joint_matrix_store(
6762
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
6863
32, layout::row_major);

sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
// REQUIRES: hip
2-
// XFAIL: hip
3-
42
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
53

64
#include <sycl/sycl.hpp>
@@ -9,12 +7,10 @@ using namespace sycl;
97
using namespace sycl::ext::oneapi::experimental::matrix;
108

119
int main() {
12-
1310
buffer<double, 1> bufA(nullptr, range<1>(1));
1411
buffer<double, 1> bufB(nullptr, range<1>(1));
1512
buffer<double, 1> bufC(nullptr, range<1>(1));
1613
buffer<double, 1> bufD(nullptr, range<1>(1));
17-
1814
queue q;
1915

2016
q.submit([&](handler &cgh) {
@@ -42,8 +38,8 @@ int main() {
4238
joint_matrix<sub_group, double, use::b, 4, 16, layout::row_major>
4339
sub_b{};
4440

45-
// CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %{{.*}}, double %{{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
46-
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
41+
// CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double {{.*}}, double {{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
42+
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
4743
joint_matrix_store(
4844
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
4945
16, layout::row_major);

sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
// REQUIRES: hip
2-
// XFAIL: hip
3-
42
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
53

64
#include <sycl/sycl.hpp>
@@ -9,12 +7,10 @@ using namespace sycl;
97
using namespace sycl::ext::oneapi::experimental::matrix;
108

119
int main() {
12-
1310
buffer<half, 1> bufA(nullptr, range<1>(1));
1411
buffer<half, 1> bufB(nullptr, range<1>(1));
1512
buffer<float, 1> bufC(nullptr, range<1>(1));
1613
buffer<float, 1> bufD(nullptr, range<1>(1));
17-
1814
queue q;
1915

2016
q.submit([&](handler &cgh) {
@@ -42,8 +38,8 @@ int main() {
4238
joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major>
4339
sub_b{};
4440

45-
// CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
46-
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
41+
// CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
42+
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
4743
joint_matrix_store(
4844
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
4945
16, layout::row_major);
@@ -60,8 +56,8 @@ int main() {
6056
joint_matrix<sub_group, half, use::b, 8, 32, layout::col_major>
6157
sub_b{};
6258

63-
// CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> {{.*}}, <4 x half> {{.*}}, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
64-
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
59+
// CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
60+
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
6561
joint_matrix_store(
6662
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
6763
32, layout::row_major);

sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
// REQUIRES: hip
2-
// XFAIL: hip
3-
42
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
53

64
#include <sycl/sycl.hpp>
@@ -9,12 +7,10 @@ using namespace sycl;
97
using namespace sycl::ext::oneapi::experimental::matrix;
108

119
int main() {
12-
1310
buffer<int8_t, 1> bufA(nullptr, range<1>(1));
1411
buffer<int8_t, 1> bufB(nullptr, range<1>(1));
1512
buffer<int32_t, 1> bufC(nullptr, range<1>(1));
1613
buffer<int32_t, 1> bufD(nullptr, range<1>(1));
17-
1814
queue q;
1915

2016
q.submit([&](handler &cgh) {
@@ -42,8 +38,8 @@ int main() {
4238
joint_matrix<sub_group, int8_t, use::b, 16, 16, layout::row_major>
4339
sub_b{};
4440

45-
// CHECK: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
46-
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
41+
// CHECK: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 {{.*}}, i32 {{.*}}, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
42+
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
4743
joint_matrix_store(
4844
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
4945
16, layout::row_major);
@@ -61,7 +57,7 @@ int main() {
6157
sub_b{};
6258

6359
// CHECK: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 {{.*}}, i32 {{.*}}, <16 x i32> zeroinitializer, i32 0, i32 0, i32 0)
64-
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
60+
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
6561
joint_matrix_store(
6662
sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
6763
32, layout::row_major);

0 commit comments

Comments
 (0)