Skip to content

[Clang][NVVM] Support -f[no-]cuda-prec-sqrt and propagate precision flag to NVVMReflect #134244

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions clang/include/clang/Basic/CodeGenOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,10 @@ class CodeGenOptions : public CodeGenOptionsBase {
/// CUDA runtime back-end for incorporating them into host-side object file.
std::string CudaGpuBinaryFileName;

/// Whether a precise or approximate square root should be used for CUDA
/// device code.
bool CudaPreciseSqrt;

/// List of filenames passed in using the -fembed-offload-object option. These
/// are offloading binaries containing device images and metadata.
std::vector<std::string> OffloadObjects;
Expand Down
5 changes: 5 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -1279,6 +1279,11 @@ def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">
Alias<fgpu_flush_denormals_to_zero>;
def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">,
Alias<fno_gpu_flush_denormals_to_zero>;
defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
CodeGenOpts<"CudaPreciseSqrt">, DefaultFalse,
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Enable">,
NegFlag<SetFalse, [], [ClangOption], "Disable">,
BothFlags<[], [ClangOption], " precise square root for CUDA device code.">>;
def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/CodeGen/CodeGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1286,6 +1286,10 @@ void CodeGenModule::Release() {
}

if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) {
// Indicate whether __nvvm_reflect should be configured to use precise
// square root. (This corresponds to its "__CUDA_PREC_SQRT" property.)
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt",
CodeGenOpts.CudaPreciseSqrt);
// Indicate whether __nvvm_reflect should be configured to flush denormal
// floating point values to 0. (This corresponds to its "__CUDA_FTZ"
// property.)
Expand Down
5 changes: 5 additions & 0 deletions clang/lib/Driver/ToolChains/Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormatAdapters.h"
#include "llvm/Support/FormatVariadic.h"
Expand Down Expand Up @@ -862,6 +863,10 @@ void CudaToolChain::addClangTargetOptions(
if (CudaInstallation.version() >= CudaVersion::CUDA_90)
CC1Args.push_back("-fcuda-allow-variadic-functions");

if (DriverArgs.hasFlag(options::OPT_fcuda_prec_sqrt,
options::OPT_fno_cuda_prec_sqrt, false))
CC1Args.append({"-fcuda-prec-sqrt"});

if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
options::OPT_fno_cuda_short_ptr, false))
CC1Args.append({"-mllvm", "--nvptx-short-ptr"});
Expand Down
15 changes: 15 additions & 0 deletions clang/test/CodeGenCUDA/prec-sqrt.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// RUN: %clang_cc1 -fcuda-is-device \
// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
// RUN: FileCheck -check-prefixes=NO-PREC-SQRT %s

// RUN: %clang_cc1 -fcuda-is-device -fcuda-prec-sqrt \
// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
// RUN: FileCheck -check-prefixes=PREC-SQRT %s

#include "Inputs/cuda.h"

extern "C" __device__ void foo() {}


// NO-PREC-SQRT: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 0}
// PREC-SQRT: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1}
6 changes: 6 additions & 0 deletions clang/test/Driver/cuda-prec-sqrt.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Checks that the -fcuda-prec-sqrt flag is passed to the cc1 frontend.

// RUN: %clang -### --target=x86_64-linux-gnu -c -fcuda-prec-sqrt -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck %s

// CHECK: "-triple" "nvptx64-nvidia-cuda"
// CHECK-SAME: "-fcuda-prec-sqrt"
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVVMReflect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
ReflectVal = Flag->getSExtValue();
} else if (ReflectArg == "__CUDA_PREC_SQRT") {
// Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module
// flag.
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt")))
ReflectVal = Flag->getSExtValue();
} else if (ReflectArg == "__CUDA_ARCH") {
ReflectVal = SmVersion * 10;
}
Expand Down
28 changes: 28 additions & 0 deletions llvm/test/CodeGen/NVPTX/nvvm-reflect-sqrt.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
; We run nvvm-reflect (and then optimize) this module twice, once with metadata
; that enables precise sqrt, and again with metadata that disables it.

; RUN: cat %s > %t.noprec
; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 0}' >> %t.noprec
; RUN: opt %t.noprec -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
; RUN: | FileCheck %s --check-prefix=PREC_SQRT_0 --check-prefix=CHECK

; RUN: cat %s > %t.prec
; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1}' >> %t.prec
; RUN: opt %t.prec -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \
; RUN: | FileCheck %s --check-prefix=PREC_SQRT_1 --check-prefix=CHECK

@.str = private unnamed_addr constant [17 x i8] c"__CUDA_PREC_SQRT\00", align 1

declare i32 @__nvvm_reflect(ptr)

; CHECK-LABEL: @foo
define i32 @foo() {
; CHECK-NOT: call i32 @__nvvm_reflect
%reflect = call i32 @__nvvm_reflect(ptr @.str)
; PREC_SQRT_0: ret i32 0
; PREC_SQRT_1: ret i32 1
ret i32 %reflect
}

!llvm.module.flags = !{!0}
; A module flag is added to the end of this file by the RUN lines at the top.