[libclc] Move sqrt to CLC library

frasercrmck · frasercrmck · commit da9ddedd2d21 · 2025-02-25T17:47:20.000Z
This is fairly straightforward for most targets.

AMDGPU provides its own implementation of sqrt for double types. This
commit moves that implementation into the CLC library, where it defines
__clc_sqrt for double. The 'default' (generic) CLC sqrt is given weak
linkage so that AMDGPU can override the builtin only for the types it
cares about.

Since we don't yet have CLC ldexp, and AMDGPU prefers the builtin
anyway, the AMDGPU implementation also switches from ldexp to
__builtin_ldexp.

There are no changes to the codegen for any AMDGPU target. There is some
minor code movement on NVIDIA targets.
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
@@ -28,6 +28,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
   spirv/lib/SOURCES;
   # CLC internal libraries
   clc/lib/generic/SOURCES;
+  clc/lib/amdgpu/SOURCES;
   clc/lib/clspv/SOURCES;
   clc/lib/spirv/SOURCES;
 )
diff --git a/libclc/amdgpu/lib/SOURCES b/libclc/amdgpu/lib/SOURCES
@@ -10,4 +10,3 @@ math/half_log2.cl
 math/half_recip.cl
 math/half_rsqrt.cl
 math/half_sqrt.cl
-math/sqrt.cl
diff --git a/libclc/clc/include/clc/float/definitions.h b/libclc/clc/include/clc/float/definitions.h
@@ -1,7 +1,6 @@
 #define MAXFLOAT 0x1.fffffep127f
 #define HUGE_VALF __builtin_huge_valf()
 #define INFINITY __builtin_inff()
-#define NAN __builtin_nanf("")
 
 #define FLT_DIG 6
 #define FLT_MANT_DIG 24
@@ -13,6 +12,7 @@
 #define FLT_MAX MAXFLOAT
 #define FLT_MIN 0x1.0p-126f
 #define FLT_EPSILON 0x1.0p-23f
+#define FLT_NAN __builtin_nanf("")
 
 #define FP_ILOGB0 (-2147483647 - 1)
 #define FP_ILOGBNAN 2147483647
@@ -46,6 +46,7 @@
 #define DBL_MAX 0x1.fffffffffffffp1023
 #define DBL_MIN 0x1.0p-1022
 #define DBL_EPSILON 0x1.0p-52
+#define DBL_NAN __builtin_nan("")
 
 #define M_E 0x1.5bf0a8b145769p+1
 #define M_LOG2E 0x1.71547652b82fep+0
@@ -80,6 +81,7 @@
 #define HALF_MAX 0x1.ffcp15h
 #define HALF_MIN 0x1.0p-14h
 #define HALF_EPSILON 0x1.0p-10h
+#define HALF_NAN __builtin_nanf16("")
 
 #define M_LOG2E_H 0x1.714p+0h
 
diff --git a/libclc/clc/include/clc/math/clc_sqrt.h b/libclc/clc/include/clc/math/clc_sqrt.h
@@ -1,8 +1,12 @@
-#include <clc/clcfunc.h>
-#include <clc/clctypes.h>
+#ifndef __CLC_MATH_CLC_SQRT_H__
+#define __CLC_MATH_CLC_SQRT_H__
 
-#define __CLC_FUNCTION __clc_sqrt
 #define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_sqrt
+
 #include <clc/math/gentype.inc>
+
 #undef __CLC_BODY
 #undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_SQRT_H__
diff --git a/libclc/clc/lib/amdgpu/SOURCES b/libclc/clc/lib/amdgpu/SOURCES
@@ -0,0 +1 @@
+math/clc_sqrt_fp64.cl
diff --git a/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl b/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl
@@ -20,52 +20,42 @@
  * THE SOFTWARE.
  */
 
-#include "math/clc_sqrt.h"
-#include <clc/clc.h>
 #include <clc/clcmacro.h>
-
-_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-_CLC_DEFINE_UNARY_BUILTIN(half, sqrt, __clc_sqrt, half)
-
-#endif
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fma.h>
 
 #ifdef cl_khr_fp64
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 #ifdef __AMDGCN__
-  #define __clc_builtin_rsq __builtin_amdgcn_rsq
+#define __clc_builtin_rsq __builtin_amdgcn_rsq
 #else
-  #define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
+#define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
 #endif
 
-_CLC_OVERLOAD _CLC_DEF double sqrt(double x) {
-
+_CLC_OVERLOAD _CLC_DEF double __clc_sqrt(double x) {
   uint vcc = x < 0x1p-767;
   uint exp0 = vcc ? 0x100 : 0;
   unsigned exp1 = vcc ? 0xffffff80 : 0;
 
-  double v01 = ldexp(x, exp0);
+  double v01 = __builtin_ldexp(x, exp0);
   double v23 = __clc_builtin_rsq(v01);
   double v45 = v01 * v23;
   v23 = v23 * 0.5;
 
-  double v67 = fma(-v23, v45, 0.5);
-  v45 = fma(v45, v67, v45);
-  double v89 = fma(-v45, v45, v01);
-  v23 = fma(v23, v67, v23);
-  v45 = fma(v89, v23, v45);
-  v67 = fma(-v45, v45, v01);
-  v23 = fma(v67, v23, v45);
+  double v67 = __clc_fma(-v23, v45, 0.5);
+  v45 = __clc_fma(v45, v67, v45);
+  double v89 = __clc_fma(-v45, v45, v01);
+  v23 = __clc_fma(v23, v67, v23);
+  v45 = __clc_fma(v89, v23, v45);
+  v67 = __clc_fma(-v45, v45, v01);
+  v23 = __clc_fma(v67, v23, v45);
 
-  v23 = ldexp(v23, exp1);
-  return ((x == __builtin_inf()) || (x == 0.0)) ? v01 : v23;
+  v23 = __builtin_ldexp(v23, exp1);
+  return (x == __builtin_inf() || (x == 0.0)) ? v01 : v23;
 }
 
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double);
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_sqrt, double);
 
 #endif
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
@@ -32,6 +32,7 @@ math/clc_nan.cl
 math/clc_nextafter.cl
 math/clc_rint.cl
 math/clc_round.cl
+math/clc_sqrt.cl
 math/clc_sw_fma.cl
 math/clc_trunc.cl
 relational/clc_all.cl
diff --git a/libclc/clc/lib/generic/math/clc_sqrt.cl b/libclc/clc/lib/generic/math/clc_sqrt.cl
@@ -20,14 +20,8 @@
  * THE SOFTWARE.
  */
 
-#include <clc/clc.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
 
-// Map the llvm sqrt intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_llvm_intr_sqrt
-#define __CLC_INTRINSIC "llvm.sqrt"
-#include <clc/math/unary_intrin.inc>
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
-
-#define __CLC_BODY <clc_sqrt_impl.inc>
+#define __CLC_BODY <clc_sqrt.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_sqrt.inc b/libclc/clc/lib/generic/math/clc_sqrt.inc
@@ -21,19 +21,17 @@
  */
 
 #if __CLC_FPSIZE == 64
-#define __CLC_NAN __builtin_nan("")
-#define ZERO 0.0
+#define __CLC_NAN DBL_NAN
 #elif __CLC_FPSIZE == 32
-#define __CLC_NAN NAN
-#define ZERO 0.0f
+#define __CLC_NAN FLT_NAN
 #elif __CLC_FPSIZE == 16
-#define __CLC_NAN (half)NAN
-#define ZERO 0.0h
+#define __CLC_NAN HALF_NAN
 #endif
 
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sqrt(__CLC_GENTYPE val) {
-  return val < ZERO ? __CLC_NAN : __clc_llvm_intr_sqrt(val);
+__attribute__((weak)) _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
+__clc_sqrt(__CLC_GENTYPE val) {
+  return val < __CLC_FP_LIT(0.0) ? (__CLC_GENTYPE)__CLC_NAN
+                                 : __builtin_elementwise_sqrt(val);
 }
 
 #undef __CLC_NAN
-#undef ZERO
diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES
@@ -180,7 +180,6 @@ math/sincos.cl
 math/sincos_helpers.cl
 math/sinh.cl
 math/sinpi.cl
-math/clc_sqrt.cl
 math/sqrt.cl
 math/clc_tan.cl
 math/tan.cl
diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl
@@ -27,6 +27,7 @@
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
+#include <clc/math/clc_sqrt.h>
 #include <clc/relational/clc_isnan.h>
 #include <clc/shared/clc_clamp.h>
 #include <math/clc_hypot.h>
@@ -49,7 +50,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
   float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
   float fx = as_float(ux) * fi_exp;
   float fy = as_float(uy) * fi_exp;
-  retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
+  retval = __clc_sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
 
   retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
   retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32
@@ -81,7 +82,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) {
   double ay = y * preadjust;
 
   // The post adjust may overflow, but this can't be avoided in any case
-  double r = sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
+  double r = __clc_sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
 
   // If the difference in exponents between x and y is large
   double s = x + y;
diff --git a/libclc/generic/lib/math/sqrt.cl b/libclc/generic/lib/math/sqrt.cl
@@ -21,7 +21,9 @@
  */
 
 #include <clc/clc.h>
-#include "math/clc_sqrt.h"
+#include <clc/math/clc_sqrt.h>
 
-#define __CLC_FUNCTION sqrt
-#include <clc/math/unary_builtin.inc>
+#define FUNCTION sqrt
+#define __CLC_BODY <clc/shared/unary_def.inc>
+
+#include <clc/math/gentype.inc>

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS`
`28`	`28`	`spirv/lib/SOURCES;`
`29`	`29`	`# CLC internal libraries`
`30`	`30`	`clc/lib/generic/SOURCES;`
	`31`	`+ clc/lib/amdgpu/SOURCES;`
`31`	`32`	`clc/lib/clspv/SOURCES;`
`32`	`33`	`clc/lib/spirv/SOURCES;`
`33`	`34`	`)`