[RISCV] Add a MIR pass to reassociate shXadd, add, and slli to form more shXadd.

topperc · topperc · commit 5d24969f1e4f · 2024-04-05T12:51:43.000-07:00
This reassociates patterns like (sh3add Z, (add X, (slli Y, 6))) into (sh3add (sh3add Y, Z), X). This improves a pattern that occurs in 531.deepsjeng_r. Reducing the dynamic instruction count by 0.5%. This may be possible to improve in SelectionDAG, but given the special cases around shXadd formation, it's not obvious it can be done in a robust way without adding multiple special cases. I've used a GEP with 2 indices because that mostly closely resembles the motivating case. Most of the test cases are the simplest GEP case. One test has a logical right shift on an index which is closer to the deepsjeng code. This requires special handling in isel to reverse a DAGCombiner canonicalization that turns a pair of shifts into (srl (and X, C1), C2). See also #85734 which had a hacky version of a similar optimization.
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -46,6 +46,7 @@ add_llvm_target(RISCVCodeGen
   RISCVMachineFunctionInfo.cpp
   RISCVMergeBaseOffset.cpp
   RISCVOptWInstrs.cpp
+  RISCVOptZba.cpp
   RISCVPostRAExpandPseudoInsts.cpp
   RISCVRedundantCopyElimination.cpp
   RISCVMoveMerger.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
@@ -46,6 +46,9 @@ void initializeRISCVFoldMasksPass(PassRegistry &);
 FunctionPass *createRISCVOptWInstrsPass();
 void initializeRISCVOptWInstrsPass(PassRegistry &);
 
+FunctionPass *createRISCVOptZbaPass();
+void initializeRISCVOptZbaPass(PassRegistry &);
+
 FunctionPass *createRISCVMergeBaseOffsetOptPass();
 void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/RISCV/RISCVOptZba.cpp b/llvm/lib/Target/RISCV/RISCVOptZba.cpp
@@ -0,0 +1,170 @@
+//===- RISCVOptZba.cpp - MI Zba instruction optimizations -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass reassociates expressions like
+//   (sh3add Z, (add X, (slli Y, 5)))
+// To
+//   (sh3add (sh2add Y, Z), X)
+//
+// If the shift amount is small enough. The outer shXadd keeps its original
+// opcode. The inner shXadd shift amount is the difference between the slli
+// shift amount and the outer shXadd shift amount.
+//
+// This pattern can appear when indexing a two dimensional array, but it is not
+// limited to that.
+//
+// TODO: We can also support slli.uw by using shXadd.uw for the inner shXadd.
+// TODO: This can be generalized to deeper expressions.
+//
+//===---------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-opt-zba"
+#define RISCV_OPT_ZBA_NAME "RISC-V Optimize Zba"
+
+namespace {
+
+class RISCVOptZba : public MachineFunctionPass {
+public:
+  static char ID;
+
+  RISCVOptZba() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return RISCV_OPT_ZBA_NAME; }
+};
+
+} // end anonymous namespace
+
+char RISCVOptZba::ID = 0;
+INITIALIZE_PASS(RISCVOptZba, DEBUG_TYPE, RISCV_OPT_ZBA_NAME, false, false)
+
+FunctionPass *llvm::createRISCVOptZbaPass() { return new RISCVOptZba(); }
+
+static MachineInstr *findShift(Register Reg, const MachineBasicBlock &MBB,
+                               MachineRegisterInfo &MRI) {
+  if (!Reg.isVirtual())
+    return nullptr;
+
+  MachineInstr *Shift = MRI.getVRegDef(Reg);
+  if (!Shift || Shift->getOpcode() != RISCV::SLLI ||
+      Shift->getParent() != &MBB || !MRI.hasOneNonDBGUse(Reg))
+    return nullptr;
+
+  return Shift;
+}
+
+bool RISCVOptZba::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+  const RISCVInstrInfo &TII = *ST.getInstrInfo();
+
+  if (!ST.hasStdExtZba())
+    return false;
+
+  bool MadeChange = true;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+      unsigned OuterShiftAmt;
+      switch (MI.getOpcode()) {
+      default:
+        continue;
+      case RISCV::SH1ADD:
+        OuterShiftAmt = 1;
+        break;
+      case RISCV::SH2ADD:
+        OuterShiftAmt = 2;
+        break;
+      case RISCV::SH3ADD:
+        OuterShiftAmt = 3;
+        break;
+      }
+
+      // Second operand must be virtual.
+      Register UnshiftedReg = MI.getOperand(2).getReg();
+      if (!UnshiftedReg.isVirtual())
+        continue;
+
+      MachineInstr *Add = MRI.getVRegDef(UnshiftedReg);
+      if (!Add || Add->getOpcode() != RISCV::ADD || Add->getParent() != &MBB ||
+          !MRI.hasOneNonDBGUse(UnshiftedReg))
+        continue;
+
+      Register AddReg0 = Add->getOperand(1).getReg();
+      Register AddReg1 = Add->getOperand(2).getReg();
+
+      MachineInstr *InnerShift;
+      Register X;
+      if ((InnerShift = findShift(AddReg0, MBB, MRI)))
+        X = AddReg1;
+      else if ((InnerShift = findShift(AddReg1, MBB, MRI)))
+        X = AddReg0;
+      else
+        continue;
+
+      unsigned InnerShiftAmt = InnerShift->getOperand(2).getImm();
+
+      // The inner shift amount must be at least as large as the outer shift
+      // amount.
+      if (OuterShiftAmt > InnerShiftAmt)
+        continue;
+
+      unsigned InnerOpc;
+      switch (InnerShiftAmt - OuterShiftAmt) {
+      default:
+        continue;
+      case 0:
+        InnerOpc = RISCV::ADD;
+        break;
+      case 1:
+        InnerOpc = RISCV::SH1ADD;
+        break;
+      case 2:
+        InnerOpc = RISCV::SH2ADD;
+        break;
+      case 3:
+        InnerOpc = RISCV::SH3ADD;
+        break;
+      }
+
+      Register Y = InnerShift->getOperand(1).getReg();
+      Register Z = MI.getOperand(1).getReg();
+
+      Register NewReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(InnerOpc), NewReg)
+          .addReg(Y)
+          .addReg(Z);
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(MI.getOpcode()),
+              MI.getOperand(0).getReg())
+          .addReg(NewReg)
+          .addReg(X);
+
+      MI.eraseFromParent();
+      Add->eraseFromParent();
+      InnerShift->eraseFromParent();
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -117,6 +117,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   initializeRISCVPostRAExpandPseudoPass(*PR);
   initializeRISCVMergeBaseOffsetOptPass(*PR);
   initializeRISCVOptWInstrsPass(*PR);
+  initializeRISCVOptZbaPass(*PR);
   initializeRISCVPreRAExpandPseudoPass(*PR);
   initializeRISCVExpandPseudoPass(*PR);
   initializeRISCVFoldMasksPass(*PR);
@@ -531,6 +532,8 @@ void RISCVPassConfig::addMachineSSAOptimization() {
   if (EnableMachineCombiner)
     addPass(&MachineCombinerID);
 
+  addPass(createRISCVOptZbaPass());
+
   if (TM->getTargetTriple().isRISCV64()) {
     addPass(createRISCVOptWInstrsPass());
   }
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -112,6 +112,7 @@
 ; CHECK-NEXT:       Machine Trace Metrics
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine InstCombiner
+; CHECK-NEXT:       RISC-V Optimize Zba
 ; RV64-NEXT:        RISC-V Optimize W Instructions
 ; CHECK-NEXT:       RISC-V Pre-RA pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V Merge Base Offset
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -1404,9 +1404,8 @@ define i64 @sh6_sh3_add2(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
 ;
 ; RV64ZBA-LABEL: sh6_sh3_add2:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    slli a1, a1, 6
-; RV64ZBA-NEXT:    add a0, a1, a0
-; RV64ZBA-NEXT:    sh3add a0, a2, a0
+; RV64ZBA-NEXT:    sh3add a1, a1, a2
+; RV64ZBA-NEXT:    sh3add a0, a1, a0
 ; RV64ZBA-NEXT:    ret
 entry:
   %shl = shl i64 %z, 3
@@ -2111,9 +2110,8 @@ define i64 @array_index_sh1_sh3(ptr %p, i64 %idx1, i64 %idx2) {
 ;
 ; RV64ZBA-LABEL: array_index_sh1_sh3:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    slli a1, a1, 4
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    sh3add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a1, a1, a2
+; RV64ZBA-NEXT:    sh3add a0, a1, a0
 ; RV64ZBA-NEXT:    ld a0, 0(a0)
 ; RV64ZBA-NEXT:    ret
   %a = getelementptr inbounds [2 x i64], ptr %p, i64 %idx1, i64 %idx2
@@ -2174,9 +2172,8 @@ define i32 @array_index_sh2_sh2(ptr %p, i64 %idx1, i64 %idx2) {
 ;
 ; RV64ZBA-LABEL: array_index_sh2_sh2:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    slli a1, a1, 4
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    sh2add a0, a2, a0
+; RV64ZBA-NEXT:    sh2add a1, a1, a2
+; RV64ZBA-NEXT:    sh2add a0, a1, a0
 ; RV64ZBA-NEXT:    lw a0, 0(a0)
 ; RV64ZBA-NEXT:    ret
   %a = getelementptr inbounds [4 x i32], ptr %p, i64 %idx1, i64 %idx2
@@ -2196,9 +2193,8 @@ define i64 @array_index_sh2_sh3(ptr %p, i64 %idx1, i64 %idx2) {
 ;
 ; RV64ZBA-LABEL: array_index_sh2_sh3:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    slli a1, a1, 5
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    sh3add a0, a2, a0
+; RV64ZBA-NEXT:    sh2add a1, a1, a2
+; RV64ZBA-NEXT:    sh3add a0, a1, a0
 ; RV64ZBA-NEXT:    ld a0, 0(a0)
 ; RV64ZBA-NEXT:    ret
   %a = getelementptr inbounds [4 x i64], ptr %p, i64 %idx1, i64 %idx2
@@ -2238,9 +2234,8 @@ define i16 @array_index_sh3_sh1(ptr %p, i64 %idx1, i64 %idx2) {
 ;
 ; RV64ZBA-LABEL: array_index_sh3_sh1:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    slli a1, a1, 4
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    sh1add a0, a2, a0
+; RV64ZBA-NEXT:    sh3add a1, a1, a2
+; RV64ZBA-NEXT:    sh1add a0, a1, a0
 ; RV64ZBA-NEXT:    lh a0, 0(a0)
 ; RV64ZBA-NEXT:    ret
   %a = getelementptr inbounds [8 x i16], ptr %p, i64 %idx1, i64 %idx2
@@ -2260,9 +2255,8 @@ define i32 @array_index_sh3_sh2(ptr %p, i64 %idx1, i64 %idx2) {
 ;
 ; RV64ZBA-LABEL: array_index_sh3_sh2:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    slli a1, a1, 5
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    sh2add a0, a2, a0
+; RV64ZBA-NEXT:    sh3add a1, a1, a2
+; RV64ZBA-NEXT:    sh2add a0, a1, a0
 ; RV64ZBA-NEXT:    lw a0, 0(a0)
 ; RV64ZBA-NEXT:    ret
   %a = getelementptr inbounds [8 x i32], ptr %p, i64 %idx1, i64 %idx2
@@ -2282,9 +2276,8 @@ define i64 @array_index_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
 ;
 ; RV64ZBA-LABEL: array_index_sh3_sh3:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    slli a1, a1, 6
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    sh3add a0, a2, a0
+; RV64ZBA-NEXT:    sh3add a1, a1, a2
+; RV64ZBA-NEXT:    sh3add a0, a1, a0
 ; RV64ZBA-NEXT:    ld a0, 0(a0)
 ; RV64ZBA-NEXT:    ret
   %a = getelementptr inbounds [8 x i64], ptr %p, i64 %idx1, i64 %idx2
@@ -2308,9 +2301,8 @@ define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
 ; RV64ZBA-LABEL: array_index_lshr_sh3_sh3:
 ; RV64ZBA:       # %bb.0:
 ; RV64ZBA-NEXT:    srli a1, a1, 58
-; RV64ZBA-NEXT:    slli a1, a1, 6
-; RV64ZBA-NEXT:    add a0, a0, a1
-; RV64ZBA-NEXT:    sh3add a0, a2, a0
+; RV64ZBA-NEXT:    sh3add a1, a1, a2
+; RV64ZBA-NEXT:    sh3add a0, a1, a0
 ; RV64ZBA-NEXT:    ld a0, 0(a0)
 ; RV64ZBA-NEXT:    ret
   %shr = lshr i64 %idx1, 58