llvm · arsenm · Aug 11, 2024 · Aug 12, 2024
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Transforms/Scalar/StructurizeCFG.h"
 #include "llvm/Transforms/Utils/FixIrreducible.h"
 #include "llvm/Transforms/Utils/LCSSA.h"
+#include "llvm/Transforms/Utils/LowerSwitch.h"
 #include "llvm/Transforms/Utils/UnifyLoopExits.h"
 
 using namespace llvm;
@@ -35,6 +36,16 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
               ShadowStackGCLoweringPass>();
 }
 
+void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
+  Base::addCodeGenPrepare(addPass);
+
+  // The LowerSwitch pass may introduce unreachable blocks that can cause
+  // unexpected behavior in subsequent passes. Run it here so that those blocks
+  // get cleaned up by UnreachableBlockElim, which is inserted next in the pass
+  // flow.
+  addPass(LowerSwitchPass());
+}
+
 void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
   const bool LateCFGStructurize = AMDGPUTargetMachine::EnableLateStructurizeCFG;
   const bool DisableStructurizer = AMDGPUTargetMachine::DisableStructurizer;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h
@@ -19,10 +19,12 @@ class GCNTargetMachine;
 class AMDGPUCodeGenPassBuilder
     : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
 public:
+  using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
+
   AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
                            const CGPassBuilderOption &Opts,
                            PassInstrumentationCallbacks *PIC);
-
+  void addCodeGenPrepare(AddIRPass &) const;
   void addPreISel(AddIRPass &addPass) const;
   void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
   Error addInstSelector(AddMachinePass &) const;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -67,12 +67,14 @@
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
 #include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LowerSwitch.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include <optional>
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
+using namespace llvm::AMDGPU;
 
 namespace {
 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
@@ -185,109 +187,95 @@ static VGPRRegisterRegAlloc fastRegAllocVGPR(
   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
 } // anonymous namespace
 
-static cl::opt<bool>
-EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
-                        cl::desc("Run early if-conversion"),
-                        cl::init(false));
+namespace llvm::AMDGPU {
+cl::opt<bool> EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+                                      cl::desc("Run early if-conversion"),
+                                      cl::init(false));
 
-static cl::opt<bool>
-OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
-            cl::desc("Run pre-RA exec mask optimizations"),
-            cl::init(true));
+cl::opt<bool> OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
+                               cl::desc("Run pre-RA exec mask optimizations"),
+                               cl::init(true));
 
-static cl::opt<bool>
+cl::opt<bool>
     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                   cl::desc("Lower GPU ctor / dtors to globals on the device."),
                   cl::init(true), cl::Hidden);
 
 // Option to disable vectorizer for tests.
-static cl::opt<bool> EnableLoadStoreVectorizer(
-  "amdgpu-load-store-vectorizer",
-  cl::desc("Enable load store vectorizer"),
-  cl::init(true),
-  cl::Hidden);
+cl::opt<bool>
+    EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer",
+                              cl::desc("Enable load store vectorizer"),
+                              cl::init(true), cl::Hidden);
 
 // Option to control global loads scalarization
-static cl::opt<bool> ScalarizeGlobal(
-  "amdgpu-scalarize-global-loads",
-  cl::desc("Enable global load scalarization"),
-  cl::init(true),
-  cl::Hidden);
+cl::opt<bool> ScalarizeGlobal("amdgpu-scalarize-global-loads",
+                              cl::desc("Enable global load scalarization"),
+                              cl::init(true), cl::Hidden);
 
 // Option to run internalize pass.
-static cl::opt<bool> InternalizeSymbols(
-  "amdgpu-internalize-symbols",
-  cl::desc("Enable elimination of non-kernel functions and unused globals"),
-  cl::init(false),
-  cl::Hidden);
+cl::opt<bool> InternalizeSymbols(
+    "amdgpu-internalize-symbols",
+    cl::desc("Enable elimination of non-kernel functions and unused globals"),
+    cl::init(false), cl::Hidden);
 
 // Option to inline all early.
-static cl::opt<bool> EarlyInlineAll(
-  "amdgpu-early-inline-all",
-  cl::desc("Inline all functions early"),
-  cl::init(false),
-  cl::Hidden);
+cl::opt<bool> EarlyInlineAll("amdgpu-early-inline-all",
+                             cl::desc("Inline all functions early"),
+                             cl::init(false), cl::Hidden);
 
-static cl::opt<bool> RemoveIncompatibleFunctions(
+cl::opt<bool> RemoveIncompatibleFunctions(
     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
-    cl::desc("Enable removal of functions when they"
+    cl::desc("Enable removal of functions when they "
              "use features not supported by the target GPU"),
     cl::init(true));
 
-static cl::opt<bool> EnableSDWAPeephole(
-  "amdgpu-sdwa-peephole",
-  cl::desc("Enable SDWA peepholer"),
-  cl::init(true));
+cl::opt<bool> EnableSDWAPeephole("amdgpu-sdwa-peephole",
+                                 cl::desc("Enable SDWA peepholer"),
+                                 cl::init(true));
 
-static cl::opt<bool> EnableDPPCombine(
-  "amdgpu-dpp-combine",
-  cl::desc("Enable DPP combiner"),
-  cl::init(true));
+cl::opt<bool> EnableDPPCombine("amdgpu-dpp-combine",
+                               cl::desc("Enable DPP combiner"), cl::init(true));
 
 // Enable address space based alias analysis
-static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
-  cl::desc("Enable AMDGPU Alias Analysis"),
-  cl::init(true));
+cl::opt<bool>
+    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+                              cl::desc("Enable AMDGPU Alias Analysis"),
+                              cl::init(true));
 
 // Option to run late CFG structurizer
-static cl::opt<bool, true> LateCFGStructurize(
-  "amdgpu-late-structurize",
-  cl::desc("Enable late CFG structurization"),
-  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
-  cl::Hidden);
+cl::opt<bool, true> LateCFGStructurize(
+    "amdgpu-late-structurize", cl::desc("Enable late CFG structurization"),
+    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden);
 
 // Disable structurizer-based control-flow lowering in order to test convergence
 // control tokens. This should eventually be replaced by the wave-transform.
-static cl::opt<bool, true> DisableStructurizer(
+cl::opt<bool, true> DisableStructurizer(
     "amdgpu-disable-structurizer",
     cl::desc("Disable structurizer for experiments; produces unusable code"),
     cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);
 
 // Enable lib calls simplifications
-static cl::opt<bool> EnableLibCallSimplify(
-  "amdgpu-simplify-libcall",
-  cl::desc("Enable amdgpu library simplifications"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> EnableLowerKernelArguments(
-  "amdgpu-ir-lower-kernel-arguments",
-  cl::desc("Lower kernel argument loads in IR pass"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> EnableRegReassign(
-  "amdgpu-reassign-regs",
-  cl::desc("Enable register reassign optimizations on gfx10+"),
-  cl::init(true),
-  cl::Hidden);
-
-static cl::opt<bool> OptVGPRLiveRange(
+cl::opt<bool>
+    EnableLibCallSimplify("amdgpu-simplify-libcall",
+                          cl::desc("Enable amdgpu library simplifications"),
+                          cl::init(true), cl::Hidden);
+
+cl::opt<bool> EnableLowerKernelArguments(
+    "amdgpu-ir-lower-kernel-arguments",
+    cl::desc("Lower kernel argument loads in IR pass"), cl::init(true),
+    cl::Hidden);
+
+cl::opt<bool> EnableRegReassign(
+    "amdgpu-reassign-regs",
+    cl::desc("Enable register reassign optimizations on gfx10+"),
+    cl::init(true), cl::Hidden);
+
+cl::opt<bool> OptVGPRLiveRange(
     "amdgpu-opt-vgpr-liverange",
     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
     cl::init(true), cl::Hidden);
 
-static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
+cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
     "amdgpu-atomic-optimizer-strategy",
     cl::desc("Select DPP or Iterative strategy for scan"),
     cl::init(ScanOptions::Iterative),
@@ -298,91 +286,85 @@ static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
 
 // Enable Mode register optimization
-static cl::opt<bool> EnableSIModeRegisterPass(
-  "amdgpu-mode-register",
-  cl::desc("Enable mode register pass"),
-  cl::init(true),
-  cl::Hidden);
+cl::opt<bool> EnableSIModeRegisterPass("amdgpu-mode-register",
+                                       cl::desc("Enable mode register pass"),
+                                       cl::init(true), cl::Hidden);
 
 // Enable GFX11.5+ s_singleuse_vdst insertion
-static cl::opt<bool>
+cl::opt<bool>
     EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
                               cl::desc("Enable s_singleuse_vdst insertion"),
                               cl::init(false), cl::Hidden);
 
 // Enable GFX11+ s_delay_alu insertion
-static cl::opt<bool>
-    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
-                         cl::desc("Enable s_delay_alu insertion"),
-                         cl::init(true), cl::Hidden);
+cl::opt<bool> EnableInsertDelayAlu("amdgpu-enable-delay-alu",
+                                   cl::desc("Enable s_delay_alu insertion"),
+                                   cl::init(true), cl::Hidden);
 
 // Enable GFX11+ VOPD
-static cl::opt<bool>
-    EnableVOPD("amdgpu-enable-vopd",
-               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
-               cl::init(true), cl::Hidden);
+cl::opt<bool> EnableVOPD("amdgpu-enable-vopd",
+                         cl::desc("Enable VOPD, dual issue of VALU in wave32"),
+                         cl::init(true), cl::Hidden);
 
 // Option is used in lit tests to prevent deadcoding of patterns inspected.
-static cl::opt<bool>
-EnableDCEInRA("amdgpu-dce-in-ra",
-    cl::init(true), cl::Hidden,
-    cl::desc("Enable machine DCE inside regalloc"));
+cl::opt<bool> EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
+                            cl::desc("Enable machine DCE inside regalloc"));
 
-static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
-                                           cl::desc("Adjust wave priority"),
-                                           cl::init(false), cl::Hidden);
+cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+                                    cl::desc("Adjust wave priority"),
+                                    cl::init(false), cl::Hidden);
 
-static cl::opt<bool> EnableScalarIRPasses(
-  "amdgpu-scalar-ir-passes",
-  cl::desc("Enable scalar IR passes"),
-  cl::init(true),
-  cl::Hidden);
+cl::opt<bool> EnableScalarIRPasses("amdgpu-scalar-ir-passes",
+                                   cl::desc("Enable scalar IR passes"),
+                                   cl::init(true), cl::Hidden);
 
-static cl::opt<bool, true> EnableStructurizerWorkarounds(
+cl::opt<bool, true> EnableStructurizerWorkarounds(
     "amdgpu-enable-structurizer-workarounds",
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::location(AMDGPUTargetMachine::EnableStructurizerWorkarounds),
     cl::init(true), cl::Hidden);
 
-static cl::opt<bool, true> EnableLowerModuleLDS(
+cl::opt<bool, true> EnableLowerModuleLDS(
     "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool> EnablePreRAOptimizations(
-    "amdgpu-enable-pre-ra-optimizations",
-    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
-    cl::Hidden);
+cl::opt<bool>
+    EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations",
+                             cl::desc("Enable Pre-RA optimizations pass"),
+                             cl::init(true), cl::Hidden);
 
-static cl::opt<bool> EnablePromoteKernelArguments(
+cl::opt<bool> EnablePromoteKernelArguments(
     "amdgpu-enable-promote-kernel-arguments",
     cl::desc("Enable promotion of flat kernel pointer arguments to global"),
     cl::Hidden, cl::init(true));
 
-static cl::opt<bool> EnableImageIntrinsicOptimizer(
+cl::opt<bool> EnableImageIntrinsicOptimizer(
     "amdgpu-enable-image-intrinsic-optimizer",
     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool>
+cl::opt<bool>
     EnableLoopPrefetch("amdgpu-loop-prefetch",
                        cl::desc("Enable loop data prefetch on AMDGPU"),
                        cl::Hidden, cl::init(false));
 
-static cl::opt<bool> EnableMaxIlpSchedStrategy(
+cl::opt<bool> EnableMaxIlpSchedStrategy(
     "amdgpu-enable-max-ilp-scheduling-strategy",
     cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
     cl::Hidden, cl::init(false));
 
-static cl::opt<bool> EnableRewritePartialRegUses(
+cl::opt<bool> EnableRewritePartialRegUses(
     "amdgpu-enable-rewrite-partial-reg-uses",
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool> EnableHipStdPar(
-  "amdgpu-enable-hipstdpar",
-  cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
-  cl::Hidden);
+cl::opt<bool>
+    EnableHipStdPar("amdgpu-enable-hipstdpar",
+                    cl::desc("Enable HIP Standard Parallelism Offload support"),
+                    cl::init(false), cl::Hidden);
+
+} // namespace llvm::AMDGPU
 
 static cl::opt<bool>
     EnableAMDGPUAttributor("amdgpu-attributor-enable",

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -16,12 +16,53 @@
 
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
 #include <optional>
 #include <utility>
 
 namespace llvm {
 
+enum class ScanOptions;
+
+namespace AMDGPU {
+
+extern cl::opt<bool> EnableEarlyIfConversion;
+extern cl::opt<bool> OptExecMaskPreRA;
+extern cl::opt<bool> LowerCtorDtor;
+extern cl::opt<bool> EnableLoadStoreVectorizer;
+extern cl::opt<bool> ScalarizeGlobal;
+extern cl::opt<bool> InternalizeSymbols;
+extern cl::opt<bool> EarlyInlineAll;
+extern cl::opt<bool> RemoveIncompatibleFunctions;
+extern cl::opt<bool> EnableSDWAPeephole;
+extern cl::opt<bool> EnableDPPCombine;
+extern cl::opt<bool> EnableAMDGPUAliasAnalysis;
+extern cl::opt<bool, true> LateCFGStructurize;
+extern cl::opt<bool, true> DisableStructurizer;
+extern cl::opt<bool> EnableLibCallSimplify;
+extern cl::opt<bool> EnableLowerKernelArguments;
+extern cl::opt<bool> EnableRegReassign;
+extern cl::opt<bool> OptVGPRLiveRange;
+extern cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy;
+extern cl::opt<bool> EnableSIModeRegisterPass;
+extern cl::opt<bool> EnableInsertSingleUseVDST;
+extern cl::opt<bool> EnableInsertDelayAlu;
+extern cl::opt<bool> EnableVOPD;
+extern cl::opt<bool> EnableDCEInRA;
+extern cl::opt<bool> EnableSetWavePriority;
+extern cl::opt<bool> EnableScalarIRPasses;
+extern cl::opt<bool, true> EnableStructurizerWorkarounds;
+extern cl::opt<bool, true> EnableLowerModuleLDS;
+extern cl::opt<bool> EnablePreRAOptimizations;
+extern cl::opt<bool> EnablePromoteKernelArguments;
+extern cl::opt<bool> EnableImageIntrinsicOptimizer;
+extern cl::opt<bool> EnableLoopPrefetch;
+extern cl::opt<bool> EnableMaxIlpSchedStrategy;
+extern cl::opt<bool> EnableRewritePartialRegUses;
+extern cl::opt<bool> EnableHipStdPar;
+} // namespace AMDGPU
+
 //===----------------------------------------------------------------------===//
 // AMDGPU Target Machine (R600+)
 //===----------------------------------------------------------------------===//