Skip to content

Commit 5c8b7e7

Browse files
authored
[SYCL] Add -fsycl-fp32-prec-sqrt flag (#5309)
This flag enables correctly rounded `sycl::sqrt` (the default precision requirement is 3 ULP). And enables the flag for CUDA and HIP targets.
1 parent 008519a commit 5c8b7e7

File tree

17 files changed

+130
-24
lines changed

17 files changed

+130
-24
lines changed

clang/include/clang/Basic/CodeGenOptions.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ CODEGENOPT(NoImplicitFloat , 1, 0) ///< Set when -mno-implicit-float is enable
182182
CODEGENOPT(NullPointerIsValid , 1, 0) ///< Assume Null pointer deference is defined.
183183
CODEGENOPT(OpenCLCorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt
184184
CODEGENOPT(HIPCorrectlyRoundedDivSqrt, 1, 1) ///< -fno-hip-fp32-correctly-rounded-divide-sqrt
185+
CODEGENOPT(SYCLFp32PrecSqrt, 1, 0) ///< -fsycl-fp32-prec-sqrt
185186
CODEGENOPT(UniqueInternalLinkageNames, 1, 0) ///< Internal Linkage symbols get unique names.
186187
CODEGENOPT(SplitMachineFunctions, 1, 0) ///< Split machine functions using profile information.
187188

clang/include/clang/Driver/Options.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4750,6 +4750,9 @@ def fno_sycl_device_lib_EQ : CommaJoined<["-"], "fno-sycl-device-lib=">, Group<s
47504750
Values<"libc, libm-fp32, libm-fp64, all">, HelpText<"Control exclusion of "
47514751
"device libraries from device binary linkage. Valid arguments "
47524752
"are libc, libm-fp32, libm-fp64, all">;
4753+
def fsycl_fp32_prec_sqrt : Flag<["-"], "fsycl-fp32-prec-sqrt">, Group<sycl_Group>, Flags<[CC1Option]>,
4754+
HelpText<"SYCL only. Specify that single precision floating-point sqrt is correctly rounded.">,
4755+
MarshallingInfoFlag<CodeGenOpts<"SYCLFp32PrecSqrt">>;
47534756

47544757
//===----------------------------------------------------------------------===//
47554758
// FLangOption + CoreOption + NoXarchOption

clang/include/clang/Driver/ToolChain.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,8 @@ class ToolChain {
708708

709709
/// Get paths of HIP device libraries.
710710
virtual llvm::SmallVector<BitCodeLibraryInfo, 12>
711-
getHIPDeviceLibs(const llvm::opt::ArgList &Args) const;
711+
getHIPDeviceLibs(const llvm::opt::ArgList &Args,
712+
const Action::OffloadKind DeviceOffloadingKind) const;
712713

713714
/// Return sanitizers which are available in this toolchain.
714715
virtual SanitizerMask getSupportedSanitizers() const;

clang/lib/Driver/ToolChain.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1119,7 +1119,9 @@ void ToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
11191119
ArgStringList &CC1Args) const {}
11201120

11211121
llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12>
1122-
ToolChain::getHIPDeviceLibs(const ArgList &DriverArgs) const {
1122+
ToolChain::getHIPDeviceLibs(
1123+
const ArgList &DriverArgs,
1124+
const Action::OffloadKind DeviceOffloadingKind) const {
11231125
return {};
11241126
}
11251127

clang/lib/Driver/ToolChains/AMDGPU.cpp

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -895,9 +895,9 @@ bool AMDGPUToolChain::shouldSkipArgument(const llvm::opt::Arg *A) const {
895895
return false;
896896
}
897897

898-
llvm::SmallVector<std::string, 12>
899-
ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs,
900-
const std::string &GPUArch) const {
898+
llvm::SmallVector<std::string, 12> ROCMToolChain::getCommonDeviceLibNames(
899+
const llvm::opt::ArgList &DriverArgs, const std::string &GPUArch,
900+
const Action::OffloadKind DeviceOffloadingKind) const {
901901
auto Kind = llvm::AMDGPU::parseArchAMDGCN(GPUArch);
902902
const StringRef CanonArch = llvm::AMDGPU::getArchNameAMDGCN(Kind);
903903

@@ -920,9 +920,15 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs,
920920
options::OPT_fno_unsafe_math_optimizations, false);
921921
bool FastRelaxedMath = DriverArgs.hasFlag(options::OPT_ffast_math,
922922
options::OPT_fno_fast_math, false);
923-
bool CorrectSqrt = DriverArgs.hasFlag(
924-
options::OPT_fhip_fp32_correctly_rounded_divide_sqrt,
925-
options::OPT_fno_hip_fp32_correctly_rounded_divide_sqrt);
923+
bool CorrectSqrt = false;
924+
if (DeviceOffloadingKind == Action::OFK_SYCL) {
925+
// When using SYCL, sqrt is only correctly rounded if the flag is specified
926+
CorrectSqrt = DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt);
927+
} else
928+
CorrectSqrt = DriverArgs.hasFlag(
929+
options::OPT_fhip_fp32_correctly_rounded_divide_sqrt,
930+
options::OPT_fno_hip_fp32_correctly_rounded_divide_sqrt);
931+
926932
bool Wave64 = isWave64(DriverArgs, Kind);
927933

928934
return RocmInstallation.getCommonBitcodeLibs(

clang/lib/Driver/ToolChains/AMDGPU.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ class LLVM_LIBRARY_VISIBILITY ROCMToolChain : public AMDGPUToolChain {
142142
// Returns a list of device library names shared by different languages
143143
llvm::SmallVector<std::string, 12>
144144
getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs,
145-
const std::string &GPUArch) const;
145+
const std::string &GPUArch,
146+
const Action::OffloadKind DeviceOffloadingKind) const;
146147
};
147148

148149
} // end namespace toolchains

clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ const char *AMDGCN::OpenMPLinker::constructLLVMLinkCommand(
124124
// - write an opt pass that sets that on every function it sees and pipe
125125
// the device-libs bitcode through that on the way to this llvm-link
126126
SmallVector<std::string, 12> BCLibs =
127-
AMDGPUOpenMPTC.getCommonDeviceLibNames(Args, SubArchName.str());
127+
AMDGPUOpenMPTC.getCommonDeviceLibNames(Args, SubArchName.str(),
128+
Action::OFK_OpenMP);
128129
llvm::for_each(BCLibs, [&](StringRef BCFile) {
129130
CmdArgs.push_back(Args.MakeArgString(BCFile));
130131
});

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,10 @@ void CudaToolChain::addClangTargetOptions(
710710
if (DeviceOffloadingKind == Action::OFK_SYCL) {
711711
toolchains::SYCLToolChain::AddSYCLIncludeArgs(getDriver(), DriverArgs,
712712
CC1Args);
713+
714+
if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt)) {
715+
CC1Args.push_back("-fcuda-prec-sqrt");
716+
}
713717
}
714718

715719
auto NoLibSpirv = DriverArgs.hasArg(options::OPT_fno_sycl_libspirv,

clang/lib/Driver/ToolChains/HIPAMD.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -260,11 +260,12 @@ void HIPAMDToolChain::addClangTargetOptions(
260260
CC1Args.push_back(DriverArgs.MakeArgString(LibSpirvFile));
261261
}
262262

263-
llvm::for_each(getHIPDeviceLibs(DriverArgs), [&](auto BCFile) {
264-
CC1Args.push_back(BCFile.ShouldInternalize ? "-mlink-builtin-bitcode"
265-
: "-mlink-bitcode-file");
266-
CC1Args.push_back(DriverArgs.MakeArgString(BCFile.Path));
267-
});
263+
llvm::for_each(
264+
getHIPDeviceLibs(DriverArgs, DeviceOffloadingKind), [&](auto BCFile) {
265+
CC1Args.push_back(BCFile.ShouldInternalize ? "-mlink-builtin-bitcode"
266+
: "-mlink-bitcode-file");
267+
CC1Args.push_back(DriverArgs.MakeArgString(BCFile.Path));
268+
});
268269
}
269270

270271
llvm::opt::DerivedArgList *
@@ -359,7 +360,9 @@ VersionTuple HIPAMDToolChain::computeMSVCVersion(const Driver *D,
359360
}
360361

361362
llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12>
362-
HIPAMDToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const {
363+
HIPAMDToolChain::getHIPDeviceLibs(
364+
const llvm::opt::ArgList &DriverArgs,
365+
const Action::OffloadKind DeviceOffloadingKind) const {
363366
llvm::SmallVector<BitCodeLibraryInfo, 12> BCLibs;
364367
if (DriverArgs.hasArg(options::OPT_nogpulib))
365368
return {};
@@ -416,7 +419,8 @@ HIPAMDToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const {
416419
BCLibs.push_back(RocmInstallation.getHIPPath());
417420

418421
// Add common device libraries like ocml etc.
419-
for (auto N : getCommonDeviceLibNames(DriverArgs, GpuArch.str()))
422+
for (auto N : getCommonDeviceLibNames(DriverArgs, GpuArch.str(),
423+
DeviceOffloadingKind))
420424
BCLibs.push_back(StringRef(N));
421425

422426
// Add instrument lib.

clang/lib/Driver/ToolChains/HIPAMD.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,9 @@ class LLVM_LIBRARY_VISIBILITY HIPAMDToolChain final : public ROCMToolChain {
8686
llvm::opt::ArgStringList &CC1Args) const override;
8787
void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
8888
llvm::opt::ArgStringList &CC1Args) const override;
89-
llvm::SmallVector<BitCodeLibraryInfo, 12>
90-
getHIPDeviceLibs(const llvm::opt::ArgList &Args) const override;
89+
llvm::SmallVector<BitCodeLibraryInfo, 12> getHIPDeviceLibs(
90+
const llvm::opt::ArgList &Args,
91+
const Action::OffloadKind DeviceOffloadingKind) const override;
9192

9293
SanitizerMask getSupportedSanitizers() const override;
9394

clang/lib/Driver/ToolChains/HIPSPV.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ void HIPSPVToolChain::addClangTargetOptions(
154154
CC1Args.append(
155155
{"-fvisibility", "hidden", "-fapply-global-visibility-to-externs"});
156156

157-
llvm::for_each(getHIPDeviceLibs(DriverArgs),
157+
llvm::for_each(getHIPDeviceLibs(DriverArgs, DeviceOffloadingKind),
158158
[&](const BitCodeLibraryInfo &BCFile) {
159159
CC1Args.append({"-mlink-builtin-bitcode",
160160
DriverArgs.MakeArgString(BCFile.Path)});
@@ -206,7 +206,9 @@ void HIPSPVToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
206206
}
207207

208208
llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12>
209-
HIPSPVToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const {
209+
HIPSPVToolChain::getHIPDeviceLibs(
210+
const llvm::opt::ArgList &DriverArgs,
211+
const Action::OffloadKind DeviceOffloadingKind) const {
210212
llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12> BCLibs;
211213
if (DriverArgs.hasArg(options::OPT_nogpulib))
212214
return {};

clang/lib/Driver/ToolChains/HIPSPV.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,9 @@ class LLVM_LIBRARY_VISIBILITY HIPSPVToolChain final : public ToolChain {
6868
llvm::opt::ArgStringList &CC1Args) const override;
6969
void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
7070
llvm::opt::ArgStringList &CC1Args) const override;
71-
llvm::SmallVector<BitCodeLibraryInfo, 12>
72-
getHIPDeviceLibs(const llvm::opt::ArgList &Args) const override;
71+
llvm::SmallVector<BitCodeLibraryInfo, 12> getHIPDeviceLibs(
72+
const llvm::opt::ArgList &Args,
73+
const Action::OffloadKind DeviceOffloadingKind) const override;
7374

7475
SanitizerMask getSupportedSanitizers() const override;
7576

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// REQUIRES: clang-driver
2+
// REQUIRES: amdgpu-registered-target
3+
// REQUIRES: !system-windows
4+
5+
// RUN: %clang -### \
6+
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa \
7+
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
8+
// RUN: -fsycl-fp32-prec-sqrt \
9+
// RUN: --rocm-path=%S/Inputs/rocm \
10+
// RUN: %s \
11+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s
12+
13+
// CHECK-CORRECT: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc"
14+
15+
// RUN: %clang -### \
16+
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa \
17+
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
18+
// RUN: --rocm-path=%S/Inputs/rocm \
19+
// RUN: %s \
20+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-APPROX %s
21+
22+
// CHECK-APPROX: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc"
23+
24+
// RUN: %clang -### \
25+
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa \
26+
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
27+
// RUN: -fsycl-fp32-prec-sqrt -fno-hip-fp32-correctly-rounded-divide-sqrt \
28+
// RUN: --rocm-path=%S/Inputs/rocm \
29+
// RUN: %s \
30+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CONFLICT %s
31+
32+
// CHECK-CONFLICT: warning: argument unused during compilation: '-fno-hip-fp32-correctly-rounded-divide-sqrt'
33+
// CHECK-CONFLICT: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc"
34+
35+
void func(){};
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// REQUIRES: clang-driver
2+
3+
// RUN: %clang -### -fsycl \
4+
// RUN: -fsycl-fp32-prec-sqrt %s 2>&1 | FileCheck %s
5+
6+
// RUN: %clang -### -fsycl -fsycl-targets=spir64_gen \
7+
// RUN: -fsycl-fp32-prec-sqrt %s 2>&1 | FileCheck %s
8+
//
9+
// RUN: %clang -### -fsycl -fsycl-targets=spir64_x86_64 \
10+
// RUN: -fsycl-fp32-prec-sqrt %s 2>&1 | FileCheck %s
11+
//
12+
// RUN: %clang -### -fsycl -fsycl-targets=spir64_fpga \
13+
// RUN: -fsycl-fp32-prec-sqrt %s 2>&1 | FileCheck %s
14+
15+
// CHECK: warning: argument unused during compilation: '-fsycl-fp32-prec-sqrt'
16+
17+
void func(){};

clang/test/Driver/sycl-nvptx-sqrt.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// REQUIRES: clang-driver
2+
// REQUIRES: nvptx-registered-target
3+
4+
// RUN: %clang -### \
5+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \
6+
// RUN: -fsycl-fp32-prec-sqrt \
7+
// RUN: %s \
8+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s
9+
10+
// CHECK-CORRECT: "-fcuda-prec-sqrt"
11+
12+
// RUN: %clang -### \
13+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \
14+
// RUN: %s \
15+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-APPROX %s
16+
17+
// CHECK-APPROX-NOT: "-fcuda-prec-sqrt"
18+
19+
void func(){};

sycl/doc/GetStartedGuide.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,7 @@ which contains all the symbols required.
818818
project and may cause compilation issues on some platforms
819819
* `sycl::sqrt` is not correctly rounded by default as the SYCL specification
820820
allows lower precision, when porting from CUDA it may be helpful to use
821-
`-Xclang -fcuda-prec-sqrt` to use the correctly rounded square root, this is
821+
`-fsycl-fp32-prec-sqrt` to use the correctly rounded square root, this is
822822
significantly slower but matches the default precision used by `nvcc`, and
823823
this `clang++` flag is equivalent to the `nvcc` `-prec-sqrt` flag, except that
824824
it defaults to `false`.

sycl/doc/UsersManual.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,14 @@ and not recommended to use in production environment.
257257
options (e.g. -c, -E, -S) may interfere with the expected output set during
258258
the host compilation. Doing so is considered undefined behavior.
259259

260+
**`-fsycl-fp32-prec-sqrt`**
261+
262+
Enable use of correctly rounded `sycl::sqrt` function as defined by IEE754.
263+
Without this flag, the default precision requirement for `sycl::sqrt` is 3
264+
ULP.
265+
266+
NOTE: This flag is currently only supported with the CUDA and HIP targets.
267+
260268
# Example: SYCL device code compilation
261269

262270
To invoke SYCL device compiler set `-fsycl-device-only` flag.

0 commit comments

Comments
 (0)