Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][CUDA] Test cases for bfloat16 math/elem wise joint_matrix #975

Merged
merged 24 commits into from
Jun 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
246 changes: 246 additions & 0 deletions SYCL/BFloat16/bfloat16_builtins.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
// REQUIRES: cuda
//
// Currently this test fails to compile for backends other than cuda.
// Other backends could use this test when bfloat16 math function support is
// added.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
// RUN: %t.out

#include <sycl/sycl.hpp>

#include <cmath>
#include <vector>

using namespace cl::sycl;
using sycl::ext::oneapi::experimental::bfloat16;

constexpr int N = 60; // divisible by all tested array sizes
constexpr float bf16_eps = 0.00390625;

float make_fp32(uint16_t x) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we still need this function in the bfloat16 case? this should use to_float function.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I've switched to the implicit cast to float.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@asudarsa, have we decided to use implicit cast rather than to_float?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bfloat16.to_float(uint16_t raw) as it is currently defined takes a raw storage type, and is a member function of bfloat16 rather than a free function: if there were to be an explicit cast I think there should be a free function added that can take bfloat16 rather than uint16_t.

uint32_t y = x;
y = y << 16;
auto res = reinterpret_cast<float *>(&y);
return *res;
}

bool check(float a, float b) {
return fabs(2 * (a - b) / (a + b)) > bf16_eps * 2;
}

#define TEST_BUILTIN_1_SCAL_IMPL(NAME) \
{ \
buffer<float> a_buf(&a[0], N); \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
accessor<float, 1, access::mode::read_write, target::device> A(a_buf, \
cgh); \
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
cgh.parallel_for(N, [=](id<1> index) { \
if (check(NAME(bfloat16{A[index]}), NAME(A[index]))) { \
ERR[0] = 1; \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_1_ARR_IMPL(NAME, SZ) \
{ \
buffer<float, 2> a_buf{range<2>{N / SZ, SZ}}; \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
accessor<float, 2, access::mode::read_write, target::device> A(a_buf, \
cgh); \
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
cgh.parallel_for(N / SZ, [=](id<1> index) { \
marray<bfloat16, SZ> arg; \
for (int i = 0; i < SZ; i++) { \
arg[i] = A[index][i]; \
} \
marray<bfloat16, SZ> res = NAME(arg); \
for (int i = 0; i < SZ; i++) { \
if (check(res[i], NAME(A[index][i]))) { \
ERR[0] = 1; \
} \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_1(NAME) \
TEST_BUILTIN_1_SCAL_IMPL(NAME) \
TEST_BUILTIN_1_ARR_IMPL(NAME, 1) \
TEST_BUILTIN_1_ARR_IMPL(NAME, 2) \
TEST_BUILTIN_1_ARR_IMPL(NAME, 3) \
TEST_BUILTIN_1_ARR_IMPL(NAME, 4) \
TEST_BUILTIN_1_ARR_IMPL(NAME, 5)

#define TEST_BUILTIN_2_SCAL_IMPL(NAME) \
{ \
buffer<float> a_buf(&a[0], N); \
buffer<float> b_buf(&b[0], N); \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
accessor<float, 1, access::mode::read_write, target::device> A(a_buf, \
cgh); \
accessor<float, 1, access::mode::read_write, target::device> B(b_buf, \
cgh); \
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
cgh.parallel_for(N, [=](id<1> index) { \
if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}), \
NAME(A[index], B[index]))) { \
ERR[0] = 1; \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_2_ARR_IMPL(NAME, SZ) \
{ \
buffer<float, 2> a_buf{range<2>{N / SZ, SZ}}; \
buffer<float, 2> b_buf{range<2>{N / SZ, SZ}}; \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
accessor<float, 2, access::mode::read_write, target::device> A(a_buf, \
cgh); \
accessor<float, 2, access::mode::read_write, target::device> B(b_buf, \
cgh); \
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
cgh.parallel_for(N / SZ, [=](id<1> index) { \
marray<bfloat16, SZ> arg0, arg1; \
for (int i = 0; i < SZ; i++) { \
arg0[i] = A[index][i]; \
arg1[i] = B[index][i]; \
} \
marray<bfloat16, SZ> res = NAME(arg0, arg1); \
for (int i = 0; i < SZ; i++) { \
if (check(res[i], NAME(A[index][i], B[index][i]))) { \
ERR[0] = 1; \
} \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_2(NAME) \
TEST_BUILTIN_2_SCAL_IMPL(NAME) \
TEST_BUILTIN_2_ARR_IMPL(NAME, 1) \
TEST_BUILTIN_2_ARR_IMPL(NAME, 2) \
TEST_BUILTIN_2_ARR_IMPL(NAME, 3) \
TEST_BUILTIN_2_ARR_IMPL(NAME, 4) \
TEST_BUILTIN_2_ARR_IMPL(NAME, 5)

#define TEST_BUILTIN_3_SCAL_IMPL(NAME) \
{ \
buffer<float> a_buf(&a[0], N); \
buffer<float> b_buf(&b[0], N); \
buffer<float> c_buf(&c[0], N); \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
accessor<float, 1, access::mode::read_write, target::device> A(a_buf, \
cgh); \
accessor<float, 1, access::mode::read_write, target::device> B(b_buf, \
cgh); \
accessor<float, 1, access::mode::read_write, target::device> C(c_buf, \
cgh); \
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
cgh.parallel_for(N, [=](id<1> index) { \
if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}, \
bfloat16{C[index]}), \
NAME(A[index], B[index], C[index]))) { \
ERR[0] = 1; \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_3_ARR_IMPL(NAME, SZ) \
{ \
buffer<float, 2> a_buf{range<2>{N / SZ, SZ}}; \
buffer<float, 2> b_buf{range<2>{N / SZ, SZ}}; \
buffer<float, 2> c_buf{range<2>{N / SZ, SZ}}; \
buffer<int> err_buf(&err, 1); \
q.submit([&](handler &cgh) { \
accessor<float, 2, access::mode::read_write, target::device> A(a_buf, \
cgh); \
accessor<float, 2, access::mode::read_write, target::device> B(b_buf, \
cgh); \
accessor<float, 2, access::mode::read_write, target::device> C(c_buf, \
cgh); \
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
cgh.parallel_for(N / SZ, [=](id<1> index) { \
marray<bfloat16, SZ> arg0, arg1, arg2; \
for (int i = 0; i < SZ; i++) { \
arg0[i] = A[index][i]; \
arg1[i] = B[index][i]; \
arg2[i] = C[index][i]; \
} \
marray<bfloat16, SZ> res = NAME(arg0, arg1, arg2); \
for (int i = 0; i < SZ; i++) { \
if (check(res[i], NAME(A[index][i], B[index][i], C[index][i]))) { \
ERR[0] = 1; \
} \
} \
}); \
}); \
} \
assert(err == 0);

#define TEST_BUILTIN_3(NAME) \
TEST_BUILTIN_3_SCAL_IMPL(NAME) \
TEST_BUILTIN_3_ARR_IMPL(NAME, 1) \
TEST_BUILTIN_3_ARR_IMPL(NAME, 2) \
TEST_BUILTIN_3_ARR_IMPL(NAME, 3) \
TEST_BUILTIN_3_ARR_IMPL(NAME, 4) \
TEST_BUILTIN_3_ARR_IMPL(NAME, 5)

#define TEST_BUILTIN_2_NAN(NAME) \
{ \
buffer<int> err_buf(&err, 1); \
buffer<float> nan_buf(&check_nan, 1); \
q.submit([&](handler &cgh) { \
accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
accessor<float, 1, access::mode::write, target::device> checkNAN( \
nan_buf, cgh); \
cgh.single_task([=]() { \
checkNAN[0] = NAME(bfloat16{NAN}, bfloat16{NAN}); \
if ((NAME(bfloat16{2}, bfloat16{NAN}) != 2) || \
(NAME(bfloat16{NAN}, bfloat16{2}) != 2)) { \
ERR[0] = 1; \
} \
}); \
}); \
} \
assert(err == 0); \
assert(std::isnan(check_nan));

int main() {
queue q;

if (q.get_device().has(aspect::ext_oneapi_bfloat16)) {
std::vector<float> a(N), b(N), c(N);
int err = 0;

for (int i = 0; i < N; i++) {
a[i] = (i - N / 2) / (float)N;
b[i] = (N / 2 - i) / (float)N;
c[i] = (float)(3 * i);
}

TEST_BUILTIN_1(fabs);
TEST_BUILTIN_2(fmin);
TEST_BUILTIN_2(fmax);
TEST_BUILTIN_3(fma);

float check_nan = 0;
TEST_BUILTIN_2_NAN(fmin);
TEST_BUILTIN_2_NAN(fmax);
}
return 0;
}
Loading