[AMDGPU][NewPM] Port AMDGPUInsertDelayAlu to NPM #128003

optimisan · 2025-02-20T13:01:04Z

No description provided.

llvmbot · 2025-02-20T13:01:40Z

@llvm/pr-subscribers-backend-amdgpu

Author: Akshat Oke (optimisan)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/128003.diff

7 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+7-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp (+34-18)
(modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+1-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+41-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h (+1)
(modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+1)
(modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir (+1)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 42392e22643b2..1444115b98d65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -270,6 +270,12 @@ struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
   ScanOptions ScanImpl;
 };
 
+struct AMDGPUInsertDelayAluPass
+    : public PassInfoMixin<AMDGPUInsertDelayAluPass> {
+  PreservedAnalyses run(MachineFunction &F,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel);
 ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
@@ -416,7 +422,7 @@ extern char &SIMemoryLegalizerID;
 void initializeSIModeRegisterPass(PassRegistry&);
 extern char &SIModeRegisterID;
 
-void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
+void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &);
 extern char &AMDGPUInsertDelayAluID;
 
 void initializeSIInsertHardClausesPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 3f2bb5df8836b..b3e371cdff8fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -23,22 +23,13 @@ using namespace llvm;
 
 namespace {
 
-class AMDGPUInsertDelayAlu : public MachineFunctionPass {
+class AMDGPUInsertDelayAlu {
 public:
-  static char ID;
-
   const SIInstrInfo *SII;
   const TargetRegisterInfo *TRI;
 
   const TargetSchedModel *SchedModel;
 
-  AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
   // Return true if MI waits for all outstanding VALU instructions to complete.
   static bool instructionWaitsForVALU(const MachineInstr &MI) {
     // These instruction types wait for VA_VDST==0 before issuing.
@@ -416,10 +407,7 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
     return Changed;
   }
 
-  bool runOnMachineFunction(MachineFunction &MF) override {
-    if (skipFunction(MF.getFunction()))
-      return false;
-
+  bool run(MachineFunction &MF) {
     LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                       << "\n");
 
@@ -454,11 +442,39 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
   }
 };
 
+class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+    AMDGPUInsertDelayAlu Impl;
+    return Impl.run(MF);
+  }
+};
 } // namespace
 
-char AMDGPUInsertDelayAlu::ID = 0;
+PreservedAnalyses
+AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
+                              MachineFunctionAnalysisManager &MFAM) {
+  if (!AMDGPUInsertDelayAlu().run(MF))
+    return PreservedAnalyses::all();
+  auto PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+} // end namespace llvm
+
+char AMDGPUInsertDelayAluLegacy::ID = 0;
 
-char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
+char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;
 
-INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
-                false, false)
+INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
+                "AMDGPU Insert Delay ALU", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index fd1341e8c91b2..468fbff31233c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -96,6 +96,7 @@ FUNCTION_PASS_WITH_PARAMS(
 #ifndef MACHINE_FUNCTION_PASS
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
 MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
 MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass())
 MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
@@ -120,7 +121,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass())
 #undef MACHINE_FUNCTION_PASS
 
 #define DUMMY_MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
-DUMMY_MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
 DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
 DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
 DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7c9377e61230b..f8431cbf05489 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -531,7 +531,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
-  initializeAMDGPUInsertDelayAluPass(*PR);
+  initializeAMDGPUInsertDelayAluLegacyPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
   initializeSIModeRegisterPass(*PR);
@@ -2145,6 +2145,46 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
   Base::addPostRegAlloc(addPass);
 }
 
+void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
+  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
+    // TODO: addPass(GCNCreateVOPDPass());
+  }
+  // TODO: addPass(SIMemoryLegalizerPass());
+  // TODO: addPass(SIInsertWaitcntsPass());
+
+  // TODO: addPass(SIModeRegisterPass());
+
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
+    // TODO: addPass(SIInsertHardClausesPass());
+  }
+
+  // addPass(SILateBranchLoweringPass());
+  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) {
+    // TODO: addPass(AMDGPUSetWavePriorityPass());
+  }
+
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
+    // TODO: addPass(SIPreEmitPeepholePass());
+  }
+
+  // The hazard recognizer that runs as part of the post-ra scheduler does not
+  // guarantee to be able handle all hazards correctly. This is because if there
+  // are multiple scheduling regions in a basic block, the regions are scheduled
+  // bottom up, so when we begin to schedule a region we don't know what
+  // instructions were emitted directly before it.
+  //
+  // Here we add a stand-alone hazard recognizer pass which can handle all
+  // cases.
+  // TODO: addPass(PostRAHazardRecognizerPass());
+  addPass(AMDGPUWaitSGPRHazardsPass());
+
+  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
+    addPass(AMDGPUInsertDelayAluPass());
+  }
+
+  // TODO: addPass(BranchRelaxationPass());
+}
+
 bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
                                              CodeGenOptLevel Level) const {
   if (Opt.getNumOccurrences())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index eb5a9ca1f86d6..3df4115324ac2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -180,6 +180,7 @@ class AMDGPUCodeGenPassBuilder
   void addPreRewrite(AddMachinePass &) const;
   void addMachineSSAOptimization(AddMachinePass &) const;
   void addPostRegAlloc(AddMachinePass &) const;
+  void addPreEmitPass(AddMachinePass &) const;
 
   /// Check if a pass is enabled given \p Opt option. The option always
   /// overrides defaults if explicitly used. Otherwise its default will be used
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index fa15e73bc31d5..88092121156e6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -2376,6 +2376,7 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
 
 PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
                                           MachineFunctionAnalysisManager &) {
+  MFPropsModifier _(*this, MF);
   bool Changed = SIFoldOperandsImpl().run(MF);
   if (!Changed) {
     return PreservedAnalyses::all();
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
index 266da50f6e543..ea8c7c956f776 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -passes=amdgpu-insert-delay-alu %s -o - | FileCheck %s
 
 ---
 name: valu_dep_1

llvm-ci · 2025-02-26T04:29:43Z

LLVM Buildbot has detected a new failure on builder clang-aarch64-quick running on linaro-clang-aarch64-quick while building llvm at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/12798

Here is the relevant piece of the build log for the reference

Step 5 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'lit :: googletest-timeout.py' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 9
not env -u FILECHECK_OPTS "/usr/bin/python3.10" /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical -v Inputs/googletest-timeout    --param gtest_filter=InfiniteLoopSubTest --timeout=1 > /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/Output/googletest-timeout.py.tmp.cmd.out
# executed command: not env -u FILECHECK_OPTS /usr/bin/python3.10 /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical -v Inputs/googletest-timeout --param gtest_filter=InfiniteLoopSubTest --timeout=1
# .---command stderr------------
# | lit.py: /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 1 seconds was requested on the command line. Forcing timeout to be 1 seconds.
# `-----------------------------
# RUN: at line 11
FileCheck --check-prefix=CHECK-INF < /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/Output/googletest-timeout.py.tmp.cmd.out /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# executed command: FileCheck --check-prefix=CHECK-INF /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# .---command stderr------------
# | /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py:34:14: error: CHECK-INF: expected string not found in input
# | # CHECK-INF: Timed Out: 1
# |              ^
# | <stdin>:13:29: note: scanning from here
# | Reached timeout of 1 seconds
# |                             ^
# | <stdin>:37:2: note: possible intended match here
# |  Timed Out: 2 (100.00%)
# |  ^
# | 
# | Input file: <stdin>
# | Check file: /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             .
# |             .
# |             .
# |             8:  
# |             9:  
# |            10: -- 
# |            11: exit: -9 
# |            12: -- 
# |            13: Reached timeout of 1 seconds 
# | check:34'0                                 X error: no match found
# |            14: ******************** 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~
# |            15: TIMEOUT: googletest-timeout :: DummySubDir/OneTest.py/1/2 (2 of 2) 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            16: ******************** TEST 'googletest-timeout :: DummySubDir/OneTest.py/1/2' FAILED ******************** 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            17: Script(shard): 
# | check:34'0     ~~~~~~~~~~~~~~~
...

llvmbot added the backend:AMDGPU label Feb 20, 2025

optimisan requested review from arsenm, pravinjagtap, cdevadas and vikramRH February 20, 2025 13:01

[AMDGPU][NewPM] Port AMDGPUInsertDelayAlu to NPM

b471246

optimisan force-pushed the port-amdgpu-insert-delay branch from 1f945ff to b471246 Compare February 20, 2025 13:02

arsenm approved these changes Feb 20, 2025

View reviewed changes

optimisan merged commit 8529238 into llvm:main Feb 26, 2025
8 checks passed

optimisan deleted the port-amdgpu-insert-delay branch February 26, 2025 04:20

huaatian mentioned this pull request Feb 28, 2025

fix live interval empty issue huaatian/llvm-project#1

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU][NewPM] Port AMDGPUInsertDelayAlu to NPM #128003

[AMDGPU][NewPM] Port AMDGPUInsertDelayAlu to NPM #128003

Uh oh!

optimisan commented Feb 20, 2025

Uh oh!

llvmbot commented Feb 20, 2025

Uh oh!

Uh oh!

llvm-ci commented Feb 26, 2025

Uh oh!

Uh oh!

[AMDGPU][NewPM] Port AMDGPUInsertDelayAlu to NPM #128003

[AMDGPU][NewPM] Port AMDGPUInsertDelayAlu to NPM #128003

Uh oh!

Conversation

optimisan commented Feb 20, 2025

Uh oh!

llvmbot commented Feb 20, 2025

Uh oh!

Uh oh!

llvm-ci commented Feb 26, 2025

Uh oh!

Uh oh!