Skip to content

Commit 29b5c18

Browse files
authored
[NVPTX] Do not run the NVVMReflect pass as part of the normal pipeline (#121834)
Summary: This pass lowers the `__nvvm_reflect` builtin in the IR. However, it currently runs in the standard optimization pipeline, not just the backend pipeline. This means that if the user creates LLVM IR without an architecture set, the pass will always fold away the reflect code even if it is intended to be used later. Pushing the pass into the backend pipeline ensures that it works as intended, allowing users to conditionally include code depending on which target architecture they ultimately compile for. This fixes a bug in OpenMP and missing code in `libc`.
1 parent ea14bdb commit 29b5c18

File tree

8 files changed

+37
-23
lines changed

8 files changed

+37
-23
lines changed

llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,18 @@ void NVPTXSubtarget::anchor() {}
3434

3535
NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
3636
StringRef FS) {
37-
// Provide the default CPU if we don't have one.
38-
TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
37+
TargetName = std::string(CPU);
3938

40-
ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
39+
ParseSubtargetFeatures(getTargetName(), /*TuneCPU=*/getTargetName(), FS);
4140

42-
// Re-map SM version numbers, SmVersion carries the regular SMs which do
43-
// have relative order, while FullSmVersion allows distinguishing sm_90 from
44-
// sm_90a, which would *not* be a subset of sm_91.
45-
SmVersion = getSmVersion();
41+
// Re-map SM version numbers, SmVersion carries the regular SMs which do
42+
// have relative order, while FullSmVersion allows distinguishing sm_90 from
43+
// sm_90a, which would *not* be a subset of sm_91.
44+
SmVersion = getSmVersion();
4645

47-
// Set default to PTX 6.0 (CUDA 9.0)
48-
if (PTXVersion == 0) {
49-
PTXVersion = 60;
46+
// Set default to PTX 6.0 (CUDA 9.0)
47+
if (PTXVersion == 0) {
48+
PTXVersion = 60;
5049
}
5150

5251
return *this;

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,12 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
111111
// - 0 represents base GPU model,
112112
// - non-zero value identifies particular architecture-accelerated variant.
113113
bool hasAAFeatures() const { return getFullSmVersion() % 10; }
114-
std::string getTargetName() const { return TargetName; }
114+
115+
// If the user did not provide a target we default to the `sm_30` target.
116+
std::string getTargetName() const {
117+
return TargetName.empty() ? "sm_30" : TargetName;
118+
}
119+
bool hasTargetName() const { return !TargetName.empty(); }
115120

116121
// Get maximum value of required alignments among the supported data types.
117122
// From the PTX ISA doc, section 8.2.3:

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,10 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
255255
PB.registerPipelineStartEPCallback(
256256
[this](ModulePassManager &PM, OptimizationLevel Level) {
257257
FunctionPassManager FPM;
258-
FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
258+
// We do not want to fold out calls to nvvm.reflect early if the user
259+
// has not provided a target architecture just yet.
260+
if (Subtarget.hasTargetName())
261+
FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
259262
// Note: NVVMIntrRangePass was causing numerical discrepancies at one
260263
// point, if issues crop up, consider disabling.
261264
FPM.addPass(NVVMIntrRangePass());

llvm/lib/Target/NVPTX/NVVMReflect.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "NVPTX.h"
2222
#include "llvm/ADT/SmallVector.h"
2323
#include "llvm/Analysis/ConstantFolding.h"
24+
#include "llvm/CodeGen/CommandFlags.h"
2425
#include "llvm/IR/Constants.h"
2526
#include "llvm/IR/DerivedTypes.h"
2627
#include "llvm/IR/Function.h"
@@ -219,7 +220,12 @@ bool NVVMReflect::runOnFunction(Function &F) {
219220
return runNVVMReflect(F, SmVersion);
220221
}
221222

222-
NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {}
223+
NVVMReflectPass::NVVMReflectPass() {
224+
// Get the CPU string from the command line if not provided.
225+
StringRef SM = codegen::getMCPU();
226+
if (!SM.consume_front("sm_") || SM.consumeInteger(10, SmVersion))
227+
SmVersion = 0;
228+
}
223229

224230
PreservedAnalyses NVVMReflectPass::run(Function &F,
225231
FunctionAnalysisManager &AM) {

llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
; Libdevice in recent CUDA versions relies on __CUDA_ARCH reflecting GPU type.
22
; Verify that __nvvm_reflect() is replaced with an appropriate value.
33
;
4-
; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
4+
; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
55
; RUN: | FileCheck %s --check-prefixes=COMMON,SM20
6-
; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
6+
; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
77
; RUN: | FileCheck %s --check-prefixes=COMMON,SM35
88

99
@"$str" = private addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"

llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
; Verify that __nvvm_reflect_ocl() is replaced with an appropriate value
22
;
3-
; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
3+
; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
44
; RUN: | FileCheck %s --check-prefixes=COMMON,SM20
5-
; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
5+
; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
66
; RUN: | FileCheck %s --check-prefixes=COMMON,SM35
77

88
@"$str" = private addrspace(4) constant [12 x i8] c"__CUDA_ARCH\00"

llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33

44
; RUN: cat %s > %t.noftz
55
; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
6-
; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
6+
; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
77
; RUN: | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
88

99
; RUN: cat %s > %t.ftz
1010
; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
11-
; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
11+
; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
1212
; RUN: | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
1313

1414
@str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,7 @@ exit:
4343

4444
declare i32 @llvm.nvvm.reflect(ptr)
4545

46-
; CHECK-LABEL: define noundef i32 @intrinsic
46+
; CHECK-LABEL: define i32 @intrinsic
4747
define i32 @intrinsic() {
4848
; CHECK-NOT: call i32 @llvm.nvvm.reflect
4949
; USE_FTZ_0: ret i32 0

llvm/test/CodeGen/NVPTX/nvvm-reflect.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33

44
; RUN: cat %s > %t.noftz
55
; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
6-
; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
6+
; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
77
; RUN: | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
88

99
; RUN: cat %s > %t.ftz
1010
; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
11-
; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
11+
; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
1212
; RUN: | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
1313

1414
@str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,8 @@ exit:
4343

4444
declare i32 @llvm.nvvm.reflect(ptr)
4545

46-
; CHECK-LABEL: define noundef i32 @intrinsic
46+
; CHECK-LABEL: define i32 @intrinsic
47+
4748
define i32 @intrinsic() {
4849
; CHECK-NOT: call i32 @llvm.nvvm.reflect
4950
; USE_FTZ_0: ret i32 0

0 commit comments

Comments
 (0)