Skip to content

[SYCL][CUDA] Add -fcuda-prec-sqrt flag #5141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/TargetOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ class TargetOptions {
/// address space.
bool NVPTXUseShortPointers = false;

/// \brief If enabled, use precise square root
bool NVVMCudaPrecSqrt = false;

/// \brief If enabled, allow AMDGPU unsafe floating point atomics.
bool AllowAMDGPUUnsafeFPAtomics = false;

Expand Down
5 changes: 5 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -980,6 +980,11 @@ defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
TargetOpts<"NVPTXUseShortPointers">, DefaultFalse,
PosFlag<SetTrue, [CC1Option], "Use 32-bit pointers for accessing const/local/shared address spaces">,
NegFlag<SetFalse>>;
defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
TargetOpts<"NVVMCudaPrecSqrt">, DefaultFalse,
PosFlag<SetTrue, [CC1Option], "Specify">,
NegFlag<SetFalse, [], "Don't specify">,
BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>;
def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,
Expand Down
4 changes: 3 additions & 1 deletion clang/lib/CodeGen/CodeGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -758,13 +758,15 @@ void CodeGenModule::Release() {
llvm::MDString::get(Ctx, CodeGenOpts.MemoryProfileOutput));
}

if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) {
if ((LangOpts.CUDAIsDevice || LangOpts.isSYCL()) && getTriple().isNVPTX()) {
// Indicate whether __nvvm_reflect should be configured to flush denormal
// floating point values to 0. (This corresponds to its "__CUDA_FTZ"
// property.)
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
CodeGenOpts.FP32DenormalMode.Output !=
llvm::DenormalMode::IEEE);
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt",
getTarget().getTargetOpts().NVVMCudaPrecSqrt);
}

if (LangOpts.EHAsynch)
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenCUDA/flush-denormals.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ extern "C" __device__ void foo() {}
// FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// NOFTZ-NOT: "denormal-fp-math-f32"

// PTXFTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
// PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}

// PTXNOFTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
// PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
11 changes: 11 additions & 0 deletions clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-prec-sqrt %s -o -| FileCheck --check-prefix=CHECK-ON %s
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s

#include "Inputs/cuda.h"

// Check that the -fcuda-prec-sqrt flag correctly sets the nvvm-reflect module flags.

extern "C" __device__ void foo() {}

// CHECK-ON: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1}
// CHECK-OFF: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 0}
19 changes: 11 additions & 8 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -343,19 +343,22 @@ Reflection Parameters
The libdevice library currently uses the following reflection parameters to
control code generation:

==================== ======================================================
Flag Description
==================== ======================================================
``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero
==================== ======================================================
=========================== ======================================================
Flag Description
=========================== ======================================================
``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero
``__CUDA_PREC_SQRT=[0,1]`` Use precise square root
=========================== ======================================================

The value of this flag is determined by the "nvvm-reflect-ftz" module flag.
The following sets the ftz flag to 1.
The value of these flags are determined by the "nvvm-reflect-ftz" and
"nvvm-reflect-prec-sqrt" module flags respectively.
The following sets the ftz flag to 1, and the precise sqrt flag to 1.

.. code-block:: llvm

!llvm.module.flag = !{!0}
!llvm.module.flag = !{!0, !1}
!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
!1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1}

(``i32 4`` indicates that the value set here overrides the value in another
module we link with. See the `LangRef <LangRef.html#module-flags-metadata>`
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVVMReflect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
ReflectVal = Flag->getSExtValue();
} else if (ReflectArg == "__CUDA_ARCH") {
ReflectVal = SmVersion * 10;
} else if (ReflectArg == "__CUDA_PREC_SQRT") {
// Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module
// flag.
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt")))
ReflectVal = Flag->getSExtValue();
}
Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
ToRemove.push_back(Call);
Expand Down
10 changes: 9 additions & 1 deletion llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,20 @@

declare i32 @__nvvm_reflect(i8*)
@str = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
@str.1 = private unnamed_addr addrspace(1) constant [17 x i8] c"__CUDA_PREC_SQRT\00"

define i32 @foo() {
%call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
; CHECK: ret i32 42
ret i32 %call
}

!llvm.module.flags = !{!0}
define i32 @foo_sqrt() {
%call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([17 x i8], [17 x i8] addrspace(1)* @str.1, i32 0, i32 0) to i8*))
; CHECK: ret i32 42
ret i32 %call
}

!llvm.module.flags = !{!0, !1}
!0 = !{i32 4, !"nvvm-reflect-ftz", i32 42}
!1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 42}
6 changes: 6 additions & 0 deletions sycl/doc/GetStartedGuide.md
Original file line number Diff line number Diff line change
Expand Up @@ -819,6 +819,12 @@ which contains all the symbols required.
GPU (SM 71), but it should work on any GPU compatible with SM 50 or above
* The NVIDIA OpenCL headers conflict with the OpenCL headers required for this
project and may cause compilation issues on some platforms
* `sycl::sqrt` is not correctly rounded by default as the SYCL specification
allows lower precision, when porting from CUDA it may be helpful to use
`-Xclang -fcuda-prec-sqrt` to use the correctly rounded square root, this is
significantly slower but matches the default precision used by `nvcc`, and
this `clang++` flag is equivalent to the `nvcc` `-prec-sqrt` flag, except that
it defaults to `false`.

### HIP back-end limitations

Expand Down