Skip to content

Commit bc7f967

Browse files
committed
[SYCL][CUDA] Add -fcuda-prec-sqrt flag
This patch add `__nvvm_reflect` support for `__CUDA_PREC_SQRT` and adds a `-Xclang -fcuda-prec-sqrt` flag which is equivalent to the `nvcc` `-prec-sqrt` flag, except that it defaults to `false` for `clang++` and to `true` for `nvcc`. The reason for that is that the SYCL specification doesn't require a correctly rounded `sqrt` so we likely want to keep the fast `sqrt` as a default and use the flag when higher precision is required. See additional discussion on intel#4041 and intel#5116
1 parent a95c12a commit bc7f967

File tree

5 files changed

+22
-1
lines changed

5 files changed

+22
-1
lines changed

clang/include/clang/Basic/TargetOptions.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ class TargetOptions {
7575
/// address space.
7676
bool NVPTXUseShortPointers = false;
7777

78+
/// \brief If enabled, use precise square root
79+
bool NVVMCudaPrecSqrt = false;
80+
7881
/// \brief If enabled, allow AMDGPU unsafe floating point atomics.
7982
bool AllowAMDGPUUnsafeFPAtomics = false;
8083

clang/include/clang/Driver/Options.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -980,6 +980,11 @@ defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
980980
TargetOpts<"NVPTXUseShortPointers">, DefaultFalse,
981981
PosFlag<SetTrue, [CC1Option], "Use 32-bit pointers for accessing const/local/shared address spaces">,
982982
NegFlag<SetFalse>>;
983+
defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
984+
TargetOpts<"NVVMCudaPrecSqrt">, DefaultFalse,
985+
PosFlag<SetTrue, [CC1Option], "Specify">,
986+
NegFlag<SetFalse, [], "Don't specify">,
987+
BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>;
983988
def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
984989
HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
985990
def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -758,13 +758,15 @@ void CodeGenModule::Release() {
758758
llvm::MDString::get(Ctx, CodeGenOpts.MemoryProfileOutput));
759759
}
760760

761-
if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) {
761+
if ((LangOpts.CUDAIsDevice || LangOpts.isSYCL()) && getTriple().isNVPTX()) {
762762
// Indicate whether __nvvm_reflect should be configured to flush denormal
763763
// floating point values to 0. (This corresponds to its "__CUDA_FTZ"
764764
// property.)
765765
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
766766
CodeGenOpts.FP32DenormalMode.Output !=
767767
llvm::DenormalMode::IEEE);
768+
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt",
769+
getTarget().getTargetOpts().NVVMCudaPrecSqrt);
768770
}
769771

770772
if (LangOpts.EHAsynch)

llvm/lib/Target/NVPTX/NVVMReflect.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,11 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
170170
ReflectVal = Flag->getSExtValue();
171171
} else if (ReflectArg == "__CUDA_ARCH") {
172172
ReflectVal = SmVersion * 10;
173+
} else if (ReflectArg == "__CUDA_PREC_SQRT") {
174+
// Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module flag.
175+
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
176+
F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt")))
177+
ReflectVal = Flag->getSExtValue();
173178
}
174179
Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
175180
ToRemove.push_back(Call);

sycl/doc/GetStartedGuide.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,12 @@ which contains all the symbols required.
819819
GPU (SM 71), but it should work on any GPU compatible with SM 50 or above
820820
* The NVIDIA OpenCL headers conflict with the OpenCL headers required for this
821821
project and may cause compilation issues on some platforms
822+
* `sycl::sqrt` is not correctly rounded by default as the SYCL specification
823+
allows lower precision, when porting from CUDA it may be helpful to use
824+
`-Xclang -fcuda-prec-sqrt` to use the correctly rounded square root, this is
825+
significantly slower but matches the default precision used by `nvcc`, and
826+
this `clang++` flag is equivalent to the `nvcc` `-prec-sqrt` flag, except that
827+
it defaults to `false`.
822828
823829
### HIP back-end limitations
824830

0 commit comments

Comments
 (0)