Skip to content

Commit 970a2df

Browse files
authored
[SYCL][CUDA] Change builtin selection for SYCL (#9768)
Libdevice for NVPTX was previously working due to the fact that LLVM intrinsics were not selected due to the CUDA toolchain having isMathErrnoDefault evaluate to `true`. Since LLVM intrinsics were not selected then the symbols could be found when linking with libdevice, which gives special backend specific definitions of CXX stdlib funcs. Using errno to prevent intrinsic selection was a hack and it gets undone by using `-ffast-math`, meaning libdevice CXX funcs were not working with `-ffast-math`. This change instead explicitly says not to use LLVM intrinsics if compiling SYCL for NVPTX backend. This means that `-ffast-math` behaviour should now be fixed for CXX stdlib funcs. @jchlanda
1 parent a307ff1 commit 970a2df

File tree

4 files changed

+158
-6
lines changed

4 files changed

+158
-6
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2437,9 +2437,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
24372437
getContext().BuiltinInfo.isConstWithoutErrnoAndExceptions(BuiltinID);
24382438
bool ConstWithoutExceptions =
24392439
getContext().BuiltinInfo.isConstWithoutExceptions(BuiltinID);
2440-
if (FD->hasAttr<ConstAttr>() ||
2441-
((ConstWithoutErrnoAndExceptions || ConstWithoutExceptions) &&
2442-
(!ConstWithoutErrnoAndExceptions || (!getLangOpts().MathErrno)))) {
2440+
if ((FD->hasAttr<ConstAttr>() ||
2441+
((ConstWithoutErrnoAndExceptions || ConstWithoutExceptions) &&
2442+
(!ConstWithoutErrnoAndExceptions || (!getLangOpts().MathErrno)))) &&
2443+
!(getLangOpts().SYCLIsDevice && getTarget().getTriple().isNVPTX())) {
24432444
switch (BuiltinIDIfNoAsmLabel) {
24442445
case Builtin::BIceil:
24452446
case Builtin::BIceilf:

clang/lib/Driver/ToolChains/Cuda.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,6 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
229229
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
230230
const llvm::fltSemantics *FPType = nullptr) const override;
231231

232-
// math-errno should be the default for SYCL but not other OFK using CUDA TC
233-
bool IsMathErrnoDefault() const override { return OK == Action::OFK_SYCL; }
234-
235232
void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
236233
llvm::opt::ArgStringList &CC1Args) const override;
237234

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// SYCL compilation uses libdevice in order to implement platform specific
2+
// versions of funcs like cosf, logf, etc. In order for the libdevice funcs
3+
// to be used, we need to make sure that llvm intrinsics such as llvm.cos.f32
4+
// are not emitted since many backends do not have lowerings for such
5+
// intrinsics. This allows the driver to link in the libdevice definitions for
6+
// cosf etc. later in the driver flow.
7+
8+
// RUN: %clang_cc1 %s -fsycl-is-device -triple nvptx64-nvidia-cuda -emit-llvm -o - | FileCheck %s
9+
// RUN: %clang_cc1 %s -fsycl-is-device -triple nvptx64-nvidia-cuda -ffast-math -emit-llvm -o - | FileCheck %s
10+
11+
#include "Inputs/sycl.hpp"
12+
13+
extern "C" {
14+
float scalbnf(float x, int n);
15+
float logf(float x);
16+
float expf(float x);
17+
float frexpf(float x, int *exp);
18+
float ldexpf(float x, int exp);
19+
float log10f(float x);
20+
float modff(float x, float *intpart);
21+
float exp2f(float x);
22+
float expm1f(float x);
23+
int ilogbf(float x);
24+
float log1pf(float x);
25+
float log2f(float x);
26+
float logbf(float x);
27+
float sqrtf(float x);
28+
float cbrtf(float x);
29+
float hypotf(float x, float y);
30+
float erff(float x);
31+
float erfcf(float x);
32+
float tgammaf(float x);
33+
float lgammaf(float x);
34+
float fmodf(float x, float y);
35+
float remainderf(float x, float y);
36+
float remquof(float x, float y, int *q);
37+
float nextafterf(float x, float y);
38+
float fdimf(float x, float y);
39+
float fmaf(float x, float y, float z);
40+
float sinf(float x);
41+
float cosf(float x);
42+
float tanf(float x);
43+
float powf(float x, float y);
44+
float acosf(float x);
45+
float asinf(float x);
46+
float atanf(float x);
47+
float atan2f(float x, float y);
48+
float coshf(float x);
49+
float sinhf(float x);
50+
float tanhf(float x);
51+
float acoshf(float x);
52+
float asinhf(float x);
53+
float atanhf(float x);
54+
};
55+
56+
// CHECK-NOT: llvm.abs.
57+
// CHECK-NOT: llvm.scalbnf.
58+
// CHECK-NOT: llvm.log.
59+
// CHECK-NOT: llvm.exp.
60+
// CHECK-NOT: llvm.frexp.
61+
// CHECK-NOT: llvm.ldexp.
62+
// CHECK-NOT: llvm.log10.
63+
// CHECK-NOT: llvm.mod.
64+
// CHECK-NOT: llvm.exp2.
65+
// CHECK-NOT: llvm.expm1.
66+
// CHECK-NOT: llvm.ilogb.
67+
// CHECK-NOT: llvm.log1p.
68+
// CHECK-NOT: llvm.log2.
69+
// CHECK-NOT: llvm.logb.
70+
// CHECK-NOT: llvm.sqrt.
71+
// CHECK-NOT: llvm.cbrt.
72+
// CHECK-NOT: llvm.hypot.
73+
// CHECK-NOT: llvm.erf.
74+
// CHECK-NOT: llvm.erfc.
75+
// CHECK-NOT: llvm.tgamma.
76+
// CHECK-NOT: llvm.lgamma.
77+
// CHECK-NOT: llvm.fmod.
78+
// CHECK-NOT: llvm.remainder.
79+
// CHECK-NOT: llvm.remquo.
80+
// CHECK-NOT: llvm.nextafter.
81+
// CHECK-NOT: llvm.fdim.
82+
// CHECK-NOT: llvm.fma.
83+
// CHECK-NOT: llvm.sin.
84+
// CHECK-NOT: llvm.cos.
85+
// CHECK-NOT: llvm.tan.
86+
// CHECK-NOT: llvm.pow.
87+
// CHECK-NOT: llvm.acos.
88+
// CHECK-NOT: llvm.asin.
89+
// CHECK-NOT: llvm.atan.
90+
// CHECK-NOT: llvm.atan2.
91+
// CHECK-NOT: llvm.cosh.
92+
// CHECK-NOT: llvm.sinh.
93+
// CHECK-NOT: llvm.tanh.
94+
// CHECK-NOT: llvm.acosh.
95+
// CHECK-NOT: llvm.asinh.
96+
// CHECK-NOT: llvm.atanh.
97+
void sycl_kernel(float *a, int *b) {
98+
sycl::queue{}.submit([&](sycl::handler &cgh) {
99+
cgh.single_task<class kernel>([=]() {
100+
a[0] = scalbnf(a[0], b[0]);
101+
a[0] = logf(a[0]);
102+
a[0] = expf(a[0]);
103+
a[0] = frexpf(a[0], b);
104+
a[0] = ldexpf(a[0], b[0]);
105+
a[0] = log10f(a[0]);
106+
a[0] = modff(a[0], a);
107+
a[0] = exp2f(a[0]);
108+
a[0] = expm1f(a[0]);
109+
a[0] = ilogbf(a[0]);
110+
a[0] = log1pf(a[0]);
111+
a[0] = log2f(a[0]);
112+
a[0] = logbf(a[0]);
113+
a[0] = sqrtf(a[0]);
114+
a[0] = cbrtf(a[0]);
115+
a[0] = hypotf(a[0], a[0]);
116+
a[0] = erff(a[0]);
117+
a[0] = erfcf(a[0]);
118+
a[0] = tgammaf(a[0]);
119+
a[0] = lgammaf(a[0]);
120+
a[0] = fmodf(a[0], a[0]);
121+
a[0] = remainderf(a[0], a[0]);
122+
a[0] = remquof(a[0], a[0], b);
123+
a[0] = nextafterf(a[0], a[0]);
124+
a[0] = fdimf(a[0], a[0]);
125+
a[0] = fmaf(a[0], a[0], a[0]);
126+
a[0] = sinf(a[0]);
127+
a[0] = cosf(a[0]);
128+
a[0] = tanf(a[0]);
129+
a[0] = powf(a[0], a[0]);
130+
a[0] = acosf(a[0]);
131+
a[0] = asinf(a[0]);
132+
a[0] = atanf(a[0]);
133+
a[0] = atan2f(a[0], a[0]);
134+
a[0] = coshf(a[0]);
135+
a[0] = sinhf(a[0]);
136+
a[0] = tanhf(a[0]);
137+
a[0] = acoshf(a[0]);
138+
a[0] = asinhf(a[0]);
139+
a[0] = atanhf(a[0]);
140+
});
141+
});
142+
}

sycl/test-e2e/DeviceLib/cmath_test.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66

77
// RUN: %{build} -fno-builtin -fsycl-device-lib-jit-link %{mathflags} -o %t.out
88
// RUN: %if !gpu %{ %{run} %t.out %}
9+
//
10+
// // Check that --fast-math works with cmath funcs for CUDA
11+
// RUN: %if cuda %{ %{build} -fno-builtin %{mathflags} -o %t.out -ffast-math -DSYCL_E2E_FASTMATH %}
12+
// RUN: %if cuda %{ %{run} %t.out %}
913

1014
#include "math_utils.hpp"
1115
#include <cmath>
@@ -92,6 +96,9 @@ template <class T> void device_cmath_test_1(s::queue &deviceQueue) {
9296

9397
res_access[i++] = !(std::signbit(infinity) == 0);
9498
res_access[i++] = !(std::signbit(minus_infinity) != 0);
99+
100+
#ifndef SYCL_E2E_FASTMATH
101+
// -ffast-math is not guaranteed to correctly detect nan etc.
95102
res_access[i++] = !(std::isunordered(minus_nan, nan) != 0);
96103
res_access[i++] = !(std::isunordered(minus_infinity, infinity) == 0);
97104
res_access[i++] = !(std::isgreater(minus_infinity, infinity) == 0);
@@ -113,6 +120,11 @@ template <class T> void device_cmath_test_1(s::queue &deviceQueue) {
113120
res_access[i++] = !(std::isnormal(minus_infinity) == 0);
114121
res_access[i++] = !(std::isnormal(subnormal) == 0);
115122
res_access[i++] = !(std::isnormal(1.0f) != 0);
123+
#else
124+
for (; i < static_cast<int>(TEST_NUM);) {
125+
res_access[i++] = 0;
126+
}
127+
#endif
116128
});
117129
});
118130
}

0 commit comments

Comments
 (0)