Skip to content

[NVPTX] Pull invariant load identification into IR pass #138015

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions llvm/lib/Target/NVPTX/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,34 +13,35 @@ add_public_tablegen_target(NVPTXCommonTableGen)
set(NVPTXCodeGen_sources
NVPTXAliasAnalysis.cpp
NVPTXAllocaHoisting.cpp
NVPTXAtomicLower.cpp
NVPTXAsmPrinter.cpp
NVPTXAssignValidGlobalNames.cpp
NVPTXAtomicLower.cpp
NVPTXCtorDtorLowering.cpp
NVPTXForwardParams.cpp
NVPTXFrameLowering.cpp
NVPTXGenericToNVVM.cpp
NVPTXISelDAGToDAG.cpp
NVPTXISelLowering.cpp
NVPTXImageOptimizer.cpp
NVPTXInstrInfo.cpp
NVPTXISelDAGToDAG.cpp
NVPTXISelLowering.cpp
NVPTXLowerAggrCopies.cpp
NVPTXLowerArgs.cpp
NVPTXLowerAlloca.cpp
NVPTXLowerArgs.cpp
NVPTXLowerUnreachable.cpp
NVPTXPeephole.cpp
NVPTXMCExpr.cpp
NVPTXPeephole.cpp
NVPTXPrologEpilogPass.cpp
NVPTXProxyRegErasure.cpp
NVPTXRegisterInfo.cpp
NVPTXReplaceImageHandles.cpp
NVPTXSelectionDAGInfo.cpp
NVPTXSubtarget.cpp
NVPTXTagInvariantLoads.cpp
NVPTXTargetMachine.cpp
NVPTXTargetTransformInfo.cpp
NVPTXUtilities.cpp
NVVMIntrRange.cpp
NVVMReflect.cpp
NVPTXProxyRegErasure.cpp
NVPTXCtorDtorLowering.cpp
)

add_llvm_target(NVPTXCodeGen
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTX.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ FunctionPass *createNVPTXLowerArgsPass();
FunctionPass *createNVPTXLowerAllocaPass();
FunctionPass *createNVPTXLowerUnreachablePass(bool TrapUnreachable,
bool NoTrapAfterNoreturn);
FunctionPass *createNVPTXTagInvariantLoadsPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
MachineFunctionPass *createNVPTXForwardParamsPass();
Expand All @@ -73,6 +74,7 @@ void initializeNVVMReflectPass(PassRegistry &);
void initializeNVPTXAAWrapperPassPass(PassRegistry &);
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
void initializeNVPTXPeepholePass(PassRegistry &);
void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);

struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
Expand Down Expand Up @@ -104,6 +106,10 @@ struct NVPTXLowerArgsPass : PassInfoMixin<NVPTXLowerArgsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

/// PassInfoMixin-based function pass registered as "nvptx-tag-invariant-loads".
/// Its run() tags loads that are inferred to be invariant and reports whether
/// the function was modified through the returned PreservedAnalyses.
struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

namespace NVPTX {
enum DrvInterface {
NVCL,
Expand Down
52 changes: 8 additions & 44 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -766,46 +766,12 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
llvm_unreachable("unhandled ordering");
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace, MachineFunction *F) {
static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
// space.
//
// We have two ways of identifying invariant loads: Loads may be explicitly
// marked as invariant, or we may infer them to be invariant.
//
// We currently infer invariance for loads from
// - constant global variables, and
// - kernel function pointer params that are noalias (i.e. __restrict) and
// never written to.
//
// TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
// not during the SelectionDAG phase).
//
// TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
// explicitly invariant loads because these are how clang tells us to use ldg
// when the user uses a builtin.
if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
return false;

if (N->isInvariant())
return true;

bool IsKernelFn = isKernelFunction(F->getFunction());

// We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
// because the former looks through phi nodes while the latter does not. We
// need to look through phi nodes to handle pointer induction variables.
SmallVector<const Value *, 8> Objs;
getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

return all_of(Objs, [&](const Value *V) {
if (auto *A = dyn_cast<const Argument>(V))
return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
if (auto *GV = dyn_cast<const GlobalVariable>(V))
return GV->isConstant();
return false;
});
return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
N.isInvariant();
}

static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
Expand Down Expand Up @@ -1106,10 +1072,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
return false;

// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
return tryLDGLDU(N);
}

SDLoc DL(N);
SDValue Chain = N->getOperand(0);
Expand Down Expand Up @@ -1192,10 +1157,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
const MVT MemVT = MemEVT.getSimpleVT();

// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace))
return tryLDGLDU(N);
}

EVT EltVT = N->getValueType(0);
SDLoc DL(N);
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,6 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA())
#endif
FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this));
FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
#undef FUNCTION_PASS
104 changes: 104 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
//===------ NVPTXTagInvariantLoads.cpp - Tag invariant loads --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements invariant load tagging. It traverses load instructions
// in a function, and determines if each load can be tagged as invariant.
//
// We currently infer invariance for loads from
// - constant global variables, and
// - kernel function pointer params that are either noalias (i.e. __restrict)
// and never written to, or grid constant.
Comment on lines +14 to +15
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can also consider grid constant arguments.

//
// TODO: Perform a more powerful invariance analysis (ideally IPO).
//
//===----------------------------------------------------------------------===//

#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/NVPTXAddrSpace.h"

using namespace llvm;

static bool isInvariantLoad(const LoadInst *LI, const bool IsKernelFn) {
// Don't bother with non-global loads
if (LI->getPointerAddressSpace() != NVPTXAS::ADDRESS_SPACE_GLOBAL)
return false;

// If the load is already marked as invariant, we don't need to do anything
if (LI->getMetadata(LLVMContext::MD_invariant_load))
return false;

// We use getUnderlyingObjects() here instead of getUnderlyingObject()
// mainly because the former looks through phi nodes while the latter does
// not. We need to look through phi nodes to handle pointer induction
// variables.
SmallVector<const Value *, 8> Objs;
getUnderlyingObjects(LI->getPointerOperand(), Objs);

return all_of(Objs, [&](const Value *V) {
if (const auto *A = dyn_cast<const Argument>(V))
return IsKernelFn && ((A->onlyReadsMemory() && A->hasNoAliasAttr()) ||
isParamGridConstant(*A));
if (const auto *GV = dyn_cast<const GlobalVariable>(V))
return GV->isConstant();
return false;
});
}

/// Tag \p LI as invariant by attaching an empty !invariant.load metadata node.
static void markLoadsAsInvariant(LoadInst *LI) {
  auto *const EmptyNode = MDNode::get(LI->getContext(), {});
  LI->setMetadata(LLVMContext::MD_invariant_load, EmptyNode);
}

/// Walk every instruction in \p F and tag each load that isInvariantLoad()
/// accepts. Returns true if at least one load was tagged.
static bool tagInvariantLoads(Function &F) {
  // Whether F is a kernel is the same for every load, so compute it once.
  const bool IsKernelFn = isKernelFunction(F);

  bool Changed = false;
  for (auto &Inst : instructions(F)) {
    auto *Load = dyn_cast<LoadInst>(&Inst);
    if (!Load || !isInvariantLoad(Load, IsKernelFn))
      continue;

    markLoadsAsInvariant(Load);
    Changed = true;
  }
  return Changed;
}

namespace {

/// FunctionPass wrapper around the invariant-load tagging logic; instances
/// are created by createNVPTXTagInvariantLoadsPass().
struct NVPTXTagInvariantLoadLegacyPass : public FunctionPass {
// Pass identifier handed to the FunctionPass constructor; defined below.
static char ID;

NVPTXTagInvariantLoadLegacyPass() : FunctionPass(ID) {}
// Tags invariant loads in F; returns true if F was modified.
bool runOnFunction(Function &F) override;
};

} // namespace

// Pass-registry initialization for the legacy pass, using the argument string
// "nvptx-tag-invariant-loads" and the description "NVPTX Tag Invariant Loads".
INITIALIZE_PASS(NVPTXTagInvariantLoadLegacyPass, "nvptx-tag-invariant-loads",
"NVPTX Tag Invariant Loads", false, false)

/// Legacy pass entry point: forwards to tagInvariantLoads() and returns its
/// result, i.e. true if any load in \p F was tagged.
bool NVPTXTagInvariantLoadLegacyPass::runOnFunction(Function &F) {
return tagInvariantLoads(F);
}

// Definition of the static pass identifier declared in the pass struct.
char NVPTXTagInvariantLoadLegacyPass::ID = 0;

/// Factory for the legacy invariant-load tagging pass. Returns a newly
/// allocated NVPTXTagInvariantLoadLegacyPass.
FunctionPass *llvm::createNVPTXTagInvariantLoadsPass() {
return new NVPTXTagInvariantLoadLegacyPass();
}

/// Entry point of the PassInfoMixin-based pass. Tags invariant loads in \p F
/// and reports all analyses as preserved when nothing was tagged, and none as
/// preserved otherwise.
PreservedAnalyses NVPTXTagInvariantLoadsPass::run(Function &F,
                                                  FunctionAnalysisManager &) {
  if (!tagInvariantLoads(F))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}
2 changes: 2 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
initializeNVPTXAAWrapperPassPass(PR);
initializeNVPTXExternalAAWrapperPass(PR);
initializeNVPTXPeepholePass(PR);
initializeNVPTXTagInvariantLoadLegacyPassPass(PR);
}

static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
Expand Down Expand Up @@ -395,6 +396,7 @@ void NVPTXPassConfig::addIRPasses() {
if (!DisableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
addPass(createSROAPass());
addPass(createNVPTXTagInvariantLoadsPass());
}

if (ST.hasPTXASUnreachableBug()) {
Expand Down
33 changes: 33 additions & 0 deletions llvm/test/CodeGen/NVPTX/byval-const-global.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mcpu=sm_70 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 | %ptxas-verify %}

target triple = "nvptx64-nvidia-cuda"

%struct = type { [2 x i64] }
@G = external constant %struct

; Passes the external constant global @G to @bar as a byval argument. The
; expected output copies @G into the param space with two plain ld.global.u64
; loads (not ld.global.nc).
; NOTE(review): presumably the copy loads are created after the IR-level
; invariant-load tagging pass has run, so they are never tagged -- confirm.
define void @foo() {
; CHECK-LABEL: foo(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.global.u64 %rd1, [G];
; CHECK-NEXT: ld.global.u64 %rd2, [G+8];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[16];
; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
; CHECK-NEXT: call.uni
; CHECK-NEXT: bar,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: ret;
call void @bar(ptr byval(%struct) @G)
ret void
}

declare void @bar(ptr)
Loading