Skip to content

AMDGPU: Declare pass control flags in header #102865

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
#include "llvm/Transforms/Utils/FixIrreducible.h"
#include "llvm/Transforms/Utils/LCSSA.h"
#include "llvm/Transforms/Utils/LowerSwitch.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"

using namespace llvm;
Expand All @@ -35,6 +36,16 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
ShadowStackGCLoweringPass>();
}

// Runs the base class's CodeGenPrepare setup and then appends LowerSwitchPass
// to the IR pass pipeline.
void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
Base::addCodeGenPrepare(addPass);

// LowerSwitch may introduce unreachable blocks that can cause unexpected
// behavior in subsequent passes. Running it here is preferable because those
// blocks are then cleaned up by UnreachableBlockElim, which is inserted next
// in the pass flow.
addPass(LowerSwitchPass());
}

void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
const bool LateCFGStructurize = AMDGPUTargetMachine::EnableLateStructurizeCFG;
const bool DisableStructurizer = AMDGPUTargetMachine::DisableStructurizer;
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ class GCNTargetMachine;
class AMDGPUCodeGenPassBuilder
: public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
public:
using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;

AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);

void addCodeGenPrepare(AddIRPass &) const;
void addPreISel(AddIRPass &addPass) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
Expand Down
198 changes: 90 additions & 108 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,14 @@
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LowerSwitch.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace llvm::AMDGPU;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
Expand Down Expand Up @@ -185,109 +187,95 @@ static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
} // anonymous namespace

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
cl::desc("Run early if-conversion"),
cl::init(false));
namespace llvm::AMDGPU {
cl::opt<bool> EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
cl::desc("Run early if-conversion"),
cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
cl::desc("Run pre-RA exec mask optimizations"),
cl::init(true));
cl::opt<bool> OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
cl::desc("Run pre-RA exec mask optimizations"),
cl::init(true));

static cl::opt<bool>
cl::opt<bool>
LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
cl::desc("Lower GPU ctor / dtors to globals on the device."),
cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
"amdgpu-load-store-vectorizer",
cl::desc("Enable load store vectorizer"),
cl::init(true),
cl::Hidden);
cl::opt<bool>
EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer",
cl::desc("Enable load store vectorizer"),
cl::init(true), cl::Hidden);

// Option to control scalarization of global loads
static cl::opt<bool> ScalarizeGlobal(
"amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
cl::init(true),
cl::Hidden);
cl::opt<bool> ScalarizeGlobal("amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
cl::init(true), cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
"amdgpu-internalize-symbols",
cl::desc("Enable elimination of non-kernel functions and unused globals"),
cl::init(false),
cl::Hidden);
cl::opt<bool> InternalizeSymbols(
"amdgpu-internalize-symbols",
cl::desc("Enable elimination of non-kernel functions and unused globals"),
cl::init(false), cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
"amdgpu-early-inline-all",
cl::desc("Inline all functions early"),
cl::init(false),
cl::Hidden);
cl::opt<bool> EarlyInlineAll("amdgpu-early-inline-all",
cl::desc("Inline all functions early"),
cl::init(false), cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
// Option controlling removal of functions that use features not supported by
// the target GPU. Enabled by default and hidden from the standard help output.
cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    // Adjacent string literals are concatenated verbatim, so the first piece
    // must end with a space to keep "they use" from rendering as "theyuse".
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
"amdgpu-sdwa-peephole",
cl::desc("Enable SDWA peepholer"),
cl::init(true));
cl::opt<bool> EnableSDWAPeephole("amdgpu-sdwa-peephole",
cl::desc("Enable SDWA peepholer"),
cl::init(true));

static cl::opt<bool> EnableDPPCombine(
"amdgpu-dpp-combine",
cl::desc("Enable DPP combiner"),
cl::init(true));
cl::opt<bool> EnableDPPCombine("amdgpu-dpp-combine",
cl::desc("Enable DPP combiner"), cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
cl::init(true));
cl::opt<bool>
EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
"amdgpu-late-structurize",
cl::desc("Enable late CFG structurization"),
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
cl::opt<bool, true> LateCFGStructurize(
"amdgpu-late-structurize", cl::desc("Enable late CFG structurization"),
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden);

// Disable structurizer-based control-flow lowering in order to test convergence
// control tokens. This should eventually be replaced by the wave-transform.
static cl::opt<bool, true> DisableStructurizer(
cl::opt<bool, true> DisableStructurizer(
"amdgpu-disable-structurizer",
cl::desc("Disable structurizer for experiments; produces unusable code"),
cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);

// Enable library call simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
cl::desc("Enable amdgpu library simplifications"),
cl::init(true),
cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
"amdgpu-ir-lower-kernel-arguments",
cl::desc("Lower kernel argument loads in IR pass"),
cl::init(true),
cl::Hidden);

static cl::opt<bool> EnableRegReassign(
"amdgpu-reassign-regs",
cl::desc("Enable register reassign optimizations on gfx10+"),
cl::init(true),
cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
cl::opt<bool>
EnableLibCallSimplify("amdgpu-simplify-libcall",
cl::desc("Enable amdgpu library simplifications"),
cl::init(true), cl::Hidden);

cl::opt<bool> EnableLowerKernelArguments(
"amdgpu-ir-lower-kernel-arguments",
cl::desc("Lower kernel argument loads in IR pass"), cl::init(true),
cl::Hidden);

cl::opt<bool> EnableRegReassign(
"amdgpu-reassign-regs",
cl::desc("Enable register reassign optimizations on gfx10+"),
cl::init(true), cl::Hidden);

cl::opt<bool> OptVGPRLiveRange(
"amdgpu-opt-vgpr-liverange",
cl::desc("Enable VGPR liverange optimizations for if-else structure"),
cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
"amdgpu-atomic-optimizer-strategy",
cl::desc("Select DPP or Iterative strategy for scan"),
cl::init(ScanOptions::Iterative),
Expand All @@ -298,91 +286,85 @@ static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
"amdgpu-mode-register",
cl::desc("Enable mode register pass"),
cl::init(true),
cl::Hidden);
cl::opt<bool> EnableSIModeRegisterPass("amdgpu-mode-register",
cl::desc("Enable mode register pass"),
cl::init(true), cl::Hidden);

// Enable GFX11.5+ s_singleuse_vdst insertion
static cl::opt<bool>
cl::opt<bool>
EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
cl::desc("Enable s_singleuse_vdst insertion"),
cl::init(false), cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
EnableInsertDelayAlu("amdgpu-enable-delay-alu",
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);
cl::opt<bool> EnableInsertDelayAlu("amdgpu-enable-delay-alu",
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
EnableVOPD("amdgpu-enable-vopd",
cl::desc("Enable VOPD, dual issue of VALU in wave32"),
cl::init(true), cl::Hidden);
cl::opt<bool> EnableVOPD("amdgpu-enable-vopd",
cl::desc("Enable VOPD, dual issue of VALU in wave32"),
cl::init(true), cl::Hidden);

// This option is used in lit tests to prevent dead-code elimination of the
// patterns being inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
cl::init(true), cl::Hidden,
cl::desc("Enable machine DCE inside regalloc"));
cl::opt<bool> EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
cl::desc("Adjust wave priority"),
cl::init(false), cl::Hidden);
cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
cl::desc("Adjust wave priority"),
cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
"amdgpu-scalar-ir-passes",
cl::desc("Enable scalar IR passes"),
cl::init(true),
cl::Hidden);
cl::opt<bool> EnableScalarIRPasses("amdgpu-scalar-ir-passes",
cl::desc("Enable scalar IR passes"),
cl::init(true), cl::Hidden);

static cl::opt<bool, true> EnableStructurizerWorkarounds(
cl::opt<bool, true> EnableStructurizerWorkarounds(
"amdgpu-enable-structurizer-workarounds",
cl::desc("Enable workarounds for the StructurizeCFG pass"),
cl::location(AMDGPUTargetMachine::EnableStructurizerWorkarounds),
cl::init(true), cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
cl::opt<bool, true> EnableLowerModuleLDS(
"amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
"amdgpu-enable-pre-ra-optimizations",
cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
cl::Hidden);
cl::opt<bool>
EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations",
cl::desc("Enable Pre-RA optimizations pass"),
cl::init(true), cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
cl::opt<bool> EnablePromoteKernelArguments(
"amdgpu-enable-promote-kernel-arguments",
cl::desc("Enable promotion of flat kernel pointer arguments to global"),
cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
cl::opt<bool> EnableImageIntrinsicOptimizer(
"amdgpu-enable-image-intrinsic-optimizer",
cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
cl::Hidden);

static cl::opt<bool>
cl::opt<bool>
EnableLoopPrefetch("amdgpu-loop-prefetch",
cl::desc("Enable loop data prefetch on AMDGPU"),
cl::Hidden, cl::init(false));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
cl::opt<bool> EnableMaxIlpSchedStrategy(
"amdgpu-enable-max-ilp-scheduling-strategy",
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
cl::Hidden, cl::init(false));

static cl::opt<bool> EnableRewritePartialRegUses(
cl::opt<bool> EnableRewritePartialRegUses(
"amdgpu-enable-rewrite-partial-reg-uses",
cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
"amdgpu-enable-hipstdpar",
cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
cl::Hidden);
cl::opt<bool>
EnableHipStdPar("amdgpu-enable-hipstdpar",
cl::desc("Enable HIP Standard Parallelism Offload support"),
cl::init(false), cl::Hidden);

} // namespace llvm::AMDGPU

static cl::opt<bool>
EnableAMDGPUAttributor("amdgpu-attributor-enable",
Expand Down
41 changes: 41 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,53 @@

#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <optional>
#include <utility>

namespace llvm {

enum class ScanOptions;

namespace AMDGPU {

extern cl::opt<bool> EnableEarlyIfConversion;
extern cl::opt<bool> OptExecMaskPreRA;
extern cl::opt<bool> LowerCtorDtor;
extern cl::opt<bool> EnableLoadStoreVectorizer;
extern cl::opt<bool> ScalarizeGlobal;
extern cl::opt<bool> InternalizeSymbols;
extern cl::opt<bool> EarlyInlineAll;
extern cl::opt<bool> RemoveIncompatibleFunctions;
extern cl::opt<bool> EnableSDWAPeephole;
extern cl::opt<bool> EnableDPPCombine;
extern cl::opt<bool> EnableAMDGPUAliasAnalysis;
extern cl::opt<bool, true> LateCFGStructurize;
extern cl::opt<bool, true> DisableStructurizer;
extern cl::opt<bool> EnableLibCallSimplify;
extern cl::opt<bool> EnableLowerKernelArguments;
extern cl::opt<bool> EnableRegReassign;
extern cl::opt<bool> OptVGPRLiveRange;
extern cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy;
extern cl::opt<bool> EnableSIModeRegisterPass;
extern cl::opt<bool> EnableInsertSingleUseVDST;
extern cl::opt<bool> EnableInsertDelayAlu;
extern cl::opt<bool> EnableVOPD;
extern cl::opt<bool> EnableDCEInRA;
extern cl::opt<bool> EnableSetWavePriority;
extern cl::opt<bool> EnableScalarIRPasses;
extern cl::opt<bool, true> EnableStructurizerWorkarounds;
extern cl::opt<bool, true> EnableLowerModuleLDS;
extern cl::opt<bool> EnablePreRAOptimizations;
extern cl::opt<bool> EnablePromoteKernelArguments;
extern cl::opt<bool> EnableImageIntrinsicOptimizer;
extern cl::opt<bool> EnableLoopPrefetch;
extern cl::opt<bool> EnableMaxIlpSchedStrategy;
extern cl::opt<bool> EnableRewritePartialRegUses;
extern cl::opt<bool> EnableHipStdPar;
} // namespace AMDGPU

//===----------------------------------------------------------------------===//
// AMDGPU Target Machine (R600+)
//===----------------------------------------------------------------------===//
Expand Down
Loading