Skip to content

[NVPTX] Improve NVVMReflect Efficiency #134416

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Apr 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
81a413f
making nvvm reflect more efficient
YonahGoldberg Apr 4, 2025
cb928a8
cleanup
YonahGoldberg Apr 4, 2025
8b33667
newline
YonahGoldberg Apr 4, 2025
806af2c
reflect improvements
YonahGoldberg Apr 7, 2025
313f679
comment move
YonahGoldberg Apr 7, 2025
8e855e0
auto
YonahGoldberg Apr 7, 2025
4debb4a
remove nvidia copyright
YonahGoldberg Apr 7, 2025
a39bcd9
improve error messages
YonahGoldberg Apr 7, 2025
a5ea1cd
fix command line options
YonahGoldberg Apr 7, 2025
cd87a02
fix command line options
YonahGoldberg Apr 7, 2025
b402f04
final reflect cleanup
YonahGoldberg Apr 8, 2025
56b6622
final reflect cleanup
YonahGoldberg Apr 8, 2025
7d0e797
clang format
YonahGoldberg Apr 8, 2025
7c3bd38
forgot to set Changed variable
YonahGoldberg Apr 8, 2025
cfab804
typo in comment
YonahGoldberg Apr 8, 2025
0eac5e9
final cleanup
YonahGoldberg Apr 8, 2025
9b0e427
formatting
YonahGoldberg Apr 8, 2025
4177763
formatting
YonahGoldberg Apr 8, 2025
04d509d
int -> unsigned
YonahGoldberg Apr 8, 2025
9c54010
const correct probably
YonahGoldberg Apr 9, 2025
8429c81
format
YonahGoldberg Apr 9, 2025
d2b828d
forgot a getOperand call
YonahGoldberg Apr 9, 2025
cbd4793
format
YonahGoldberg Apr 9, 2025
7ddc7e4
format
YonahGoldberg Apr 9, 2025
1ac6c2b
newline
YonahGoldberg Apr 9, 2025
716c8f0
removing const
YonahGoldberg Apr 10, 2025
ece70ba
format
YonahGoldberg Apr 10, 2025
cc159a9
updated reflect options test
YonahGoldberg Apr 10, 2025
2056414
newline
YonahGoldberg Apr 10, 2025
47f280e
final style changes
YonahGoldberg Apr 10, 2025
ac0ffbf
clang format
YonahGoldberg Apr 10, 2025
b9c5704
Merge branch 'main' into nvvm-reflect
YonahGoldberg Apr 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions clang/test/CodeGen/builtins-nvptx.c
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
// RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
// ### The last run to check with the highest SM and PTX version available
// ### to make sure target builtins are still accepted.
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s

#define __device__ __attribute__((device))
Expand Down Expand Up @@ -61,6 +61,7 @@ __device__ bool reflect() {

unsigned x = __nvvm_reflect("__CUDA_ARCH");
return x >= 700;

}

__device__ int read_ntid() {
Expand Down
7 changes: 4 additions & 3 deletions llvm/lib/Target/NVPTX/NVPTX.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ ModulePass *createNVPTXAssignValidGlobalNamesPass();
ModulePass *createGenericToNVVMLegacyPass();
ModulePass *createNVPTXCtorDtorLoweringLegacyPass();
FunctionPass *createNVVMIntrRangePass();
FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
ModulePass *createNVVMReflectPass(unsigned int SmVersion);
MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
Expand All @@ -55,6 +55,7 @@ MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
MachineFunctionPass *createNVPTXForwardParamsPass();

void initializeNVVMReflectLegacyPassPass(PassRegistry &);
void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
Expand All @@ -78,9 +79,9 @@ struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
};

struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> {
NVVMReflectPass();
NVVMReflectPass() : SmVersion(0) {}
NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
PreservedAnalyses run(Module &F, ModuleAnalysisManager &AM);

private:
unsigned SmVersion;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#endif
MODULE_PASS("generic-to-nvvm", GenericToNVVMPass())
MODULE_PASS("nvptx-lower-ctor-dtor", NVPTXCtorDtorLoweringPass())
MODULE_PASS("nvvm-reflect", NVVMReflectPass())
#undef MODULE_PASS

#ifndef FUNCTION_ANALYSIS
Expand All @@ -36,7 +37,6 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA())
#define FUNCTION_PASS(NAME, CREATE_PASS)
#endif
FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
FUNCTION_PASS("nvvm-reflect", NVVMReflectPass())
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this));
#undef FUNCTION_PASS
7 changes: 4 additions & 3 deletions llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
// FIXME: This pass is really intended to be invoked during IR optimization,
// but it's very NVPTX-specific.
initializeNVVMReflectPass(PR);
initializeNVVMReflectLegacyPassPass(PR);
initializeNVVMIntrRangePass(PR);
initializeGenericToNVVMLegacyPassPass(PR);
initializeNVPTXAllocaHoistingPass(PR);
Expand Down Expand Up @@ -240,11 +240,12 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

PB.registerPipelineStartEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
// We do not want to fold out calls to nvvm.reflect early if the user
// has not provided a target architecture just yet.
if (Subtarget.hasTargetName())
FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
PM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));

FunctionPassManager FPM;
// Note: NVVMIntrRangePass was causing numerical discrepancies at one
// point, if issues crop up, consider disabling.
FPM.addPass(NVVMIntrRangePass());
Expand Down
Loading