Skip to content

[AMDGPU] Introduce "amdgpu-uniform-intrinsic-combine" pass to combine uniform AMDGPU lane Intrinsics. #116953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 40 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
be7ad78
[WIP][AMDGPU] combine uniform AMDGPU lane Intrinsics
PankajDwivedi-25 Nov 21, 2024
8dae2e7
refactored and updated intrinsics handling
PankajDwivedi-25 Nov 22, 2024
d8d3666
removed redundant casting
PankajDwivedi-25 Nov 22, 2024
4311e65
refactored, added more test
PankajDwivedi-25 Dec 5, 2024
1bed57f
integrated in pipeline, more test added
PankajDwivedi-25 Jan 9, 2025
ed204b9
removed unused gfx checks
PankajDwivedi-25 Jan 15, 2025
847fef4
added pass to llc pipeline, more test added
PankajDwivedi-25 Feb 7, 2025
e0fc6fc
handled ballot with icmp for trivial waterfall
PankajDwivedi-25 Feb 18, 2025
661ce96
updated test
PankajDwivedi-25 Feb 18, 2025
8963961
Fix: use isDivergentUse instead isUniform
PankajDwivedi-25 Feb 19, 2025
5fb5f8c
addressed reviews
PankajDwivedi-25 Feb 19, 2025
982096c
pull the ballot argument to all the match users
PankajDwivedi-25 Feb 20, 2025
d4b7ec0
Match and replace icmp ballot,0 with XOR
PankajDwivedi-25 Feb 21, 2025
6297b9d
Rebase: resolve merge
PankajDwivedi-25 Feb 24, 2025
603d5f6
remove undef test
PankajDwivedi-25 Feb 24, 2025
2781457
add icmp ne case, and test
PankajDwivedi-25 Mar 3, 2025
261b4ff
Add run line shows difference without this pass in pipeline
PankajDwivedi-25 Mar 4, 2025
c34d392
addressed reviews
PankajDwivedi-25 Mar 5, 2025
eb73c6a
address reviews
PankajDwivedi-25 Mar 6, 2025
1bca2e7
remove icmp ballot,1
PankajDwivedi-25 Mar 12, 2025
01e3ed6
address review
PankajDwivedi-25 Mar 13, 2025
51cb723
Exit early if module don't have intrinsic declaration in it
PankajDwivedi-25 Mar 13, 2025
c9ace74
pass insertion point into createNot
PankajDwivedi-25 Mar 15, 2025
a1a0706
avoiding unnecessary function-wide scanning
PankajDwivedi-25 Mar 17, 2025
316472c
Erase dead intrinsic call
PankajDwivedi-25 Mar 18, 2025
f22d719
Bug: Possible issue if II still have users
PankajDwivedi-25 Mar 19, 2025
f8da0bc
Exit early if ballot has no Icmp use
PankajDwivedi-25 Mar 19, 2025
5e3c8fa
Refactor: address reviews
PankajDwivedi-25 Mar 19, 2025
8a8f4f4
Address review
PankajDwivedi-25 Mar 20, 2025
f5b900d
Extend intrinsic types
PankajDwivedi-25 Mar 25, 2025
0dcbc1c
Iterate over functions in module to check if dec exists
PankajDwivedi-25 Mar 26, 2025
bb3a69e
refactor: address review
PankajDwivedi-25 Apr 1, 2025
61d1024
Refactor: update UI to ref instead pointer
PankajDwivedi-25 Apr 11, 2025
c058c3c
Remove OPM support
PankajDwivedi-25 Apr 11, 2025
d2b8976
Refactor: address review
PankajDwivedi-25 Apr 11, 2025
5b4cf1c
address reviews
PankajDwivedi-25 Apr 14, 2025
c96c9e8
use switch rather iterating over list
PankajDwivedi-25 Apr 14, 2025
5b5b32a
Refactor: change to module pass
PankajDwivedi-25 Apr 14, 2025
684d561
Refactor
PankajDwivedi-25 Apr 14, 2025
45c7468
refactor
PankajDwivedi-25 Apr 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,11 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;

struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS

#ifndef MODULE_PASS_WITH_PARAMS
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);

static cl::opt<bool> EnableUniformIntrinsicCombine(
"amdgpu-enable-uniform-intrinsic-combine",
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
cl::init(true), cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
Expand Down Expand Up @@ -827,6 +832,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());

if (EnableUniformIntrinsicCombine)
PM.addPass(AMDGPUUniformIntrinsicCombinePass());
});

PB.registerPeepholeEPCallback(
Expand Down
138 changes: 138 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass simplifies certain intrinsic calls when the arguments are uniform.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"

using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;

/// Optimizes uniform intrinsics.
static bool optimizeUniformIntrinsic(IntrinsicInst &II,
const UniformityInfo &UI) {
llvm::Intrinsic::ID IID = II.getIntrinsicID();

switch (IID) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
// Check if the argument use is divergent
if (UI.isDivergentUse(II.getOperandUse(0)))
return false;
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
II.replaceAllUsesWith(Src);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is replacing a contextually / path dependent query with a value that is not. I think we need to attach some kind of convergent use call to capture the point here. What if later code motion moves it such that an assumed uniform value is no longer use-point uniform?

You can maybe get away with replace only dominated uses by this instruction, but I'd need to think if there are still potential hazards if later transforms introduce divergence

Copy link
Contributor Author

@PankajDwivedi-25 PankajDwivedi-25 Apr 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haven't we already discussed something similar earlier? #116953 (review)

Copy link
Collaborator

@ssahasra ssahasra Apr 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are replacing an "always uniform" value X (the intrinsic call) with its uniform operand (Y), which can potentially become uniform divergent after other transformations. But that is no different from a later transformation treating X (the intrinsic call) as divergent by simply ignoring the uniformity analysis at that point of the optimization pipeline. By replacing X with Y, we lose the ability to preserve uniformity but this is not a correctness issue. This is still a performance issue, and then the question is whether the current optimization is useful in general or not.

II.eraseFromParent();
return true;
}
case Intrinsic::amdgcn_ballot: {
Value *Src = II.getArgOperand(0);
if (UI.isDivergentUse(II.getOperandUse(0)))
return false;
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');

// If there are no ICmp users, return early.
if (none_of(II.users(), [](User *U) { return isa<ICmpInst>(U); }))
return false;

bool Changed = false;
for (User *U : make_early_inc_range(II.users())) {
if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
Value *Op0 = ICmp->getOperand(0);
Value *Op1 = ICmp->getOperand(1);
ICmpInst::Predicate Pred = ICmp->getPredicate();
Value *OtherOp = Op0 == &II ? Op1 : Op0;

if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
// Case (icmp eq %ballot, 0) --> xor %ballot_arg, 1
Instruction *NotOp =
BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
// (icmp ne %ballot, 0) --> %ballot_arg
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
ICmp->eraseFromParent();
Changed = true;
}
}
}
// Erase the intrinsic if it has no remaining uses.
if (II.use_empty())
II.eraseFromParent();
return Changed;
}
default:
llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
}
return false;
}

/// Iterate over the Intrinsics use in the Module to optimise.
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
bool IsChanged = false;
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
for (Function &F : M) {
switch (F.getIntrinsicID()) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_ballot:
break;
default:
continue;
}

for (User *U : F.users()) {
auto *II = cast<IntrinsicInst>(U);
Function *ParentF = II->getFunction();
if (ParentF->isDeclaration())
continue;

const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
IsChanged |= optimizeUniformIntrinsic(*II, UI);
}
}
return IsChanged;
}

PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
if (!runUniformIntrinsicCombine(M, AM))
return PreservedAnalyses::all();

PreservedAnalyses PA;
PA.preserve<UniformityInfoAnalysis>();
return PA;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
AMDGPUUniformIntrinsicCombine.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUISelDAGToDAG.cpp
Expand Down
Loading
Loading