[SYCL][CUDA] Add -fcuda-prec-sqrt flag (#5141)

npmiller · web-flow · commit ebf9ea871ea0 · 2021-12-31T19:14:08.000+03:00
This patch add `__nvvm_reflect` support for `__CUDA_PREC_SQRT` and adds a `-Xclang -fcuda-prec-sqrt` flag which is equivalent to the `nvcc` `-prec-sqrt` flag, except that it defaults to `false` for `clang++` and to `true` for `nvcc`. The reason for that is that the SYCL specification doesn't require a correctly rounded `sqrt` so we likely want to keep the fast `sqrt` as a default and use the flag when higher precision is required. See additional discussion on #4041 and #5116
diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h
@@ -75,6 +75,9 @@ class TargetOptions {
   /// address space.
   bool NVPTXUseShortPointers = false;
 
+  /// \brief If enabled, use precise square root
+  bool NVVMCudaPrecSqrt = false;
+
   /// \brief If enabled, allow AMDGPU unsafe floating point atomics.
   bool AllowAMDGPUUnsafeFPAtomics = false;
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
@@ -980,6 +980,11 @@ defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
   TargetOpts<"NVPTXUseShortPointers">, DefaultFalse,
   PosFlag<SetTrue, [CC1Option], "Use 32-bit pointers for accessing const/local/shared address spaces">,
   NegFlag<SetFalse>>;
+defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
+  TargetOpts<"NVVMCudaPrecSqrt">, DefaultFalse,
+  PosFlag<SetTrue, [CC1Option], "Specify">,
+  NegFlag<SetFalse, [], "Don't specify">,
+  BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>;
 def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
   HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
 def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -771,13 +771,15 @@ void CodeGenModule::Release() {
         llvm::MDString::get(Ctx, CodeGenOpts.MemoryProfileOutput));
   }
 
-  if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) {
+  if ((LangOpts.CUDAIsDevice || LangOpts.isSYCL()) && getTriple().isNVPTX()) {
     // Indicate whether __nvvm_reflect should be configured to flush denormal
     // floating point values to 0.  (This corresponds to its "__CUDA_FTZ"
     // property.)
     getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                               CodeGenOpts.FP32DenormalMode.Output !=
                                   llvm::DenormalMode::IEEE);
+    getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt",
+                              getTarget().getTargetOpts().NVVMCudaPrecSqrt);
   }
 
   if (LangOpts.EHAsynch)
diff --git a/clang/test/CodeGenCUDA/flush-denormals.cu b/clang/test/CodeGenCUDA/flush-denormals.cu
@@ -44,8 +44,8 @@ extern "C" __device__ void foo() {}
 // FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
 // NOFTZ-NOT: "denormal-fp-math-f32"
 
-// PTXFTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
+// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
 // PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
 
-// PTXNOFTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
+// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
 // PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
diff --git a/clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu b/clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-prec-sqrt %s -o -| FileCheck --check-prefix=CHECK-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s
+
+#include "Inputs/cuda.h"
+
+// Check that the -fcuda-prec-sqrt flag correctly sets the nvvm-reflect module flags.
+
+extern "C" __device__ void foo() {}
+
+// CHECK-ON: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1}
+// CHECK-OFF: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 0}
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
@@ -343,19 +343,22 @@ Reflection Parameters
 The libdevice library currently uses the following reflection parameters to
 control code generation:
 
-==================== ======================================================
-Flag                 Description
-==================== ======================================================
-``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero
-==================== ======================================================
+=========================== ======================================================
+Flag                        Description
+=========================== ======================================================
+``__CUDA_FTZ=[0,1]``        Use optimized code paths that flush subnormals to zero
+``__CUDA_PREC_SQRT=[0,1]``  Use precise square root
+=========================== ======================================================
 
-The value of this flag is determined by the "nvvm-reflect-ftz" module flag.
-The following sets the ftz flag to 1.
+The value of these flags are determined by the "nvvm-reflect-ftz" and
+"nvvm-reflect-prec-sqrt" module flags respectively.
+The following sets the ftz flag to 1, and the precise sqrt flag to 1.
 
 .. code-block:: llvm
 
-    !llvm.module.flag = !{!0}
+    !llvm.module.flag = !{!0, !1}
     !0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+    !1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1}
 
 (``i32 4`` indicates that the value set here overrides the value in another
 module we link with.  See the `LangRef <LangRef.html#module-flags-metadata>`
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -170,6 +170,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
         ReflectVal = Flag->getSExtValue();
     } else if (ReflectArg == "__CUDA_ARCH") {
       ReflectVal = SmVersion * 10;
+    } else if (ReflectArg == "__CUDA_PREC_SQRT") {
+      // Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module
+      // flag.
+      if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+              F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt")))
+        ReflectVal = Flag->getSExtValue();
     }
     Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
     ToRemove.push_back(Call);
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll
@@ -3,12 +3,20 @@
 
 declare i32 @__nvvm_reflect(i8*)
 @str = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
+@str.1 = private unnamed_addr addrspace(1) constant [17 x i8] c"__CUDA_PREC_SQRT\00"
 
 define i32 @foo() {
   %call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
   ; CHECK: ret i32 42
   ret i32 %call
 }
 
-!llvm.module.flags = !{!0}
+define i32 @foo_sqrt() {
+  %call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([17 x i8], [17 x i8] addrspace(1)* @str.1, i32 0, i32 0) to i8*))
+  ; CHECK: ret i32 42
+  ret i32 %call
+}
+
+!llvm.module.flags = !{!0, !1}
 !0 = !{i32 4, !"nvvm-reflect-ftz", i32 42}
+!1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 42}
diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md
@@ -830,6 +830,12 @@ which contains all the symbols required.
   GPU (SM 71), but it should work on any GPU compatible with SM 50 or above
 * The NVIDIA OpenCL headers conflict with the OpenCL headers required for this
   project and may cause compilation issues on some platforms
+* `sycl::sqrt` is not correctly rounded by default as the SYCL specification
+  allows lower precision, when porting from CUDA it may be helpful to use
+  `-Xclang -fcuda-prec-sqrt` to use the correctly rounded square root, this is
+  significantly slower but matches the default precision used by `nvcc`, and
+  this `clang++` flag is equivalent to the `nvcc` `-prec-sqrt` flag, except that
+  it defaults to `false`.
 
 ### HIP back-end limitations