Skip to content

Commit 701d726

Browse files
[NVPTX] Improve NVVMReflect Efficiency (#134416)
The NVVMReflect pass simply replaces calls to nvvm-reflect functions with the appropriate constant, either the architecture number, or nvvm-reflect-ftz, found in the module's metadata. The implementation is inefficient and does this by traversing through all instructions to find calls. The common case is that you never call nvvm-reflect, so this traversal is costly. This PR: - Updates the pass so that it finds the reflect functions by name, and then traverses through their uses to find the calls directly. - Adds a line (245) to make sure the dead nvvm-reflect definitions are erased. - Adds the ability to set reflect values via command line
1 parent 1711996 commit 701d726

File tree

6 files changed

+237
-165
lines changed

6 files changed

+237
-165
lines changed

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,37 @@
11
// REQUIRES: nvptx-registered-target
22
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
3-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
3+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
44
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
55
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
6-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
6+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
77
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
88
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
9-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
9+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
1010
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
1111
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
12-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
12+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
1313
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
1414
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
15-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
15+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
1616
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
1717
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
18-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
18+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
1919
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
2020
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
2121
// RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
2222
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
23-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
23+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
2424
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
2525
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
26-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
26+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
2727
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
2828
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
29-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
29+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
3030
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
3131
// ### The last run to check with the highest SM and PTX version available
3232
// ### to make sure target builtins are still accepted.
3333
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
34-
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
34+
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
3535
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
3636

3737
#define __device__ __attribute__((device))
@@ -61,6 +61,7 @@ __device__ bool reflect() {
6161

6262
unsigned x = __nvvm_reflect("__CUDA_ARCH");
6363
return x >= 700;
64+
6465
}
6566

6667
__device__ int read_ntid() {

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ ModulePass *createNVPTXAssignValidGlobalNamesPass();
4343
ModulePass *createGenericToNVVMLegacyPass();
4444
ModulePass *createNVPTXCtorDtorLoweringLegacyPass();
4545
FunctionPass *createNVVMIntrRangePass();
46-
FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
46+
ModulePass *createNVVMReflectPass(unsigned int SmVersion);
4747
MachineFunctionPass *createNVPTXPrologEpilogPass();
4848
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
4949
FunctionPass *createNVPTXImageOptimizerPass();
@@ -55,6 +55,7 @@ MachineFunctionPass *createNVPTXPeephole();
5555
MachineFunctionPass *createNVPTXProxyRegErasurePass();
5656
MachineFunctionPass *createNVPTXForwardParamsPass();
5757

58+
void initializeNVVMReflectLegacyPassPass(PassRegistry &);
5859
void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
5960
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
6061
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
@@ -78,9 +79,9 @@ struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
7879
};
7980

8081
struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> {
81-
NVVMReflectPass();
82+
NVVMReflectPass() : SmVersion(0) {}
8283
NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {}
83-
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
84+
PreservedAnalyses run(Module &F, ModuleAnalysisManager &AM);
8485

8586
private:
8687
unsigned SmVersion;

llvm/lib/Target/NVPTX/NVPTXPassRegistry.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#endif
1919
MODULE_PASS("generic-to-nvvm", GenericToNVVMPass())
2020
MODULE_PASS("nvptx-lower-ctor-dtor", NVPTXCtorDtorLoweringPass())
21+
MODULE_PASS("nvvm-reflect", NVVMReflectPass())
2122
#undef MODULE_PASS
2223

2324
#ifndef FUNCTION_ANALYSIS
@@ -36,7 +37,6 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA())
3637
#define FUNCTION_PASS(NAME, CREATE_PASS)
3738
#endif
3839
FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
39-
FUNCTION_PASS("nvvm-reflect", NVVMReflectPass())
4040
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
4141
FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this));
4242
#undef FUNCTION_PASS

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
9595
PassRegistry &PR = *PassRegistry::getPassRegistry();
9696
// FIXME: This pass is really intended to be invoked during IR optimization,
9797
// but it's very NVPTX-specific.
98-
initializeNVVMReflectPass(PR);
98+
initializeNVVMReflectLegacyPassPass(PR);
9999
initializeNVVMIntrRangePass(PR);
100100
initializeGenericToNVVMLegacyPassPass(PR);
101101
initializeNVPTXAllocaHoistingPass(PR);
@@ -240,11 +240,12 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
240240

241241
PB.registerPipelineStartEPCallback(
242242
[this](ModulePassManager &PM, OptimizationLevel Level) {
243-
FunctionPassManager FPM;
244243
// We do not want to fold out calls to nvvm.reflect early if the user
245244
// has not provided a target architecture just yet.
246245
if (Subtarget.hasTargetName())
247-
FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
246+
PM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
247+
248+
FunctionPassManager FPM;
248249
// Note: NVVMIntrRangePass was causing numerical discrepancies at one
249250
// point, if issues crop up, consider disabling.
250251
FPM.addPass(NVVMIntrRangePass());

0 commit comments

Comments
 (0)