@@ -734,8 +734,8 @@ std::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sin(T x) __NOEXC {
734
734
735
735
// svgenfloat sincos (svgenfloat x, genfloatptr cosval)
736
736
template <typename T, typename T2>
737
- std::enable_if_t <
738
- detail::is_svgenfloat<T>::value && detail::is_genfloatptr<T2>::value, T>
737
+ std::enable_if_t <__FAST_MATH_GENFLOAT(T) && detail::is_genfloatptr<T2>::value,
738
+ T>
739
739
sincos (T x, T2 cosval) __NOEXC {
740
740
detail::check_vector_size<T, T2>();
741
741
return __sycl_std::__invoke_sincos<T>(x, cosval);
@@ -2500,6 +2500,23 @@ std::enable_if_t<detail::is_svgenfloatf<T>::value, T> cos(T x) __NOEXC {
2500
2500
return native::cos (x);
2501
2501
}
2502
2502
2503
+ // svgenfloatf sincos (svgenfloatf x, genfloatptr cosval)
2504
+ // This is a performance optimization to ensure that sincos isn't slower than a
2505
+ // pair of sin/cos executed separately. Theoretically, calling non-native sincos
2506
+ // might be faster than calling native::sin plus native::cos separately and we'd
2507
+ // need some kind of cost model to make the right decision (and move this
2508
+ // entirely to the JIT/AOT compilers). However, in practice, this simpler
2509
+ // solution seems to work just fine and matches how sin/cos above are optimized
2510
+ // for the fast math path.
2511
+ template <typename T, typename T2>
2512
+ std::enable_if_t <
2513
+ detail::is_svgenfloatf<T>::value && detail::is_genfloatptr<T2>::value, T>  // overload participates only when both traits hold; returns T
2514
+ sincos (T x, T2 cosval) __NOEXC {
2515
+ detail::check_vector_size<T, T2>();  // NOTE(review): presumably checks that T and T2 agree on vector size; helper is defined outside this view
2516
+ *cosval = native::cos (x);  // cosine of x is stored through the caller-supplied pointer
2517
+ return native::sin (x);  // sine of x is the return value; both halves use the native:: variants
2518
+ }
2519
+
2503
2520
// svgenfloatf exp (svgenfloatf x)
2504
2521
template <typename T>
2505
2522
std::enable_if_t <detail::is_svgenfloatf<T>::value, T> exp (T x) __NOEXC {
0 commit comments