@@ -77,6 +77,7 @@ int main() {
 // CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_accC, i32 16) #{{.*}}
 joint_matrix_load(sg, sub_c, accC.get_pointer(), N);
 
+// CHECK: tail call i32 @llvm.nvvm.f2tf32.rna(float {{.*}}
 // Round a, b to tf32
 for (auto i = 0; i < 4; ++i)
   sub_a.data[i] = float_to_tf32(sub_a.data[i]);
@@ -120,14 +121,15 @@ int main() {
 joint_matrix_load(sg, sub_b, accB.get_pointer(), N);
 // CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, i32 {{.*}}) #{{.*}}
 joint_matrix_load(sg, sub_c, accC.get_pointer(), N);
-
+
+// CHECK: tail call i32 @llvm.nvvm.f2tf32.rna(float {{.*}}
 // Round a, b to tf32
 for (auto i = 0; i < 4; ++i)
   sub_a.data[i] = float_to_tf32(sub_a.data[i]);
 
 for (auto i = 0; i < 4; ++i)
   sub_b.data[i] = float_to_tf32(sub_b.data[i]);
-
+
 // CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.col.col.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}}
 sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
 // CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) #{{.*}}
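The CHECK lines added in both hunks pin the lowering of `float_to_tf32` to the `llvm.nvvm.f2tf32.rna` NVVM intrinsic (which corresponds to PTX's `cvt.rna.tf32.f32` and, per the CHECK pattern, returns the converted value's raw bits as an `i32`). As context for what that rounding does, here is a minimal software sketch, under the assumption that tf32 keeps f32's sign bit and 8-bit exponent but only the top 10 of the 23 mantissa bits; the helper name `float_to_tf32_sw` is hypothetical and this is not the SYCL runtime's implementation:

#include <cstdint>
#include <cstring>

// Hypothetical software model of "rna" float->tf32 rounding (not the
// SYCL runtime's float_to_tf32). tf32 drops the low 13 mantissa bits.
// NaN/Inf inputs are not handled in this sketch.
inline float float_to_tf32_sw(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits)); // raw f32 bits, no UB
  bits += 1u << 12;          // add half the dropped range: round to
                             // nearest, ties away from zero ("rna")
  bits &= ~((1u << 13) - 1); // clear the 13 low mantissa bits
  float r;
  std::memcpy(&r, &bits, sizeof(r));
  return r; // still a 32-bit float, now exactly representable in tf32
}

Adding half of the discarded range before masking is what makes this round-to-nearest with ties away from zero; the value stays in a 32-bit container, since tf32 narrows the precision rather than the storage type.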