[RISCV] Move RISCVVMV0Elimination past pre-ra scheduling #132057
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

This moves RISCVVMV0Elimination as late as we can likely move it, past the pre-ra machine scheduler. This means the scheduler is now able to schedule masked instructions past other masked instructions, since the mask operands it sees go from copies into the physical register $v0 to vmv0 virtual registers; a rough sketch of the difference is below.
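Before this patch, the scheduler saw something like the following (an illustrative sketch only: the value names are invented and the operand lists are simplified, though PseudoVADD_VV_M1_MASK/PseudoVSUB_VV_M1_MASK are the kind of masked pseudos involved). Every masked instruction reads the physical $v0 set up by a preceding copy, which keeps the masked instructions serialized:

$v0 = COPY %mask0
%a:vrnov0 = PseudoVADD_VV_M1_MASK %pt, %x, %y, $v0, %avl, 5, 0
$v0 = COPY %mask1
%b:vrnov0 = PseudoVSUB_VV_M1_MASK %pt, %x, %y, $v0, %avl, 5, 0

After this patch, the masks are still vmv0 virtual registers when the pre-ra scheduler runs, so there is no physical-register dependence forcing the masked instructions to stay in order:

%a:vrnov0 = PseudoVADD_VV_M1_MASK %pt, %x, %y, %mask0:vmv0, %avl, 5, 0
%b:vrnov0 = PseudoVSUB_VV_M1_MASK %pt, %x, %y, %mask1:vmv0, %avl, 5, 0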
On SPEC CPU 2017 we see a geomean ~3% reduction in the total number of vector registers spilled and reloaded.

There are a few changes needed here that are tied to moving the pass: it no longer runs on SSA form, it now preserves and updates live intervals when they are available, and undef and sub-register mask operands have to be handled explicitly (see the diff below).

@wangpc-pp @preames I tried to minimize the diff by editing RISCVVectorMaskDAGMutation to also add edges between vmv0 virtual register uses and producers, but this was a no-op. The diffs from this patch aren't between V0 users and producers, but rather come from scheduling differences between e.g. two instructions that use the same mask. Separately, after this patch we can remove RISCVVectorMaskDAGMutation with no test diff, so it supersedes its functionality. I can do that as a follow-up if we land this.

Patch is 2.13 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132057.diff

157 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index f78e5f8147d98..080b660cc48ca 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -445,6 +445,7 @@ FunctionPass *RISCVPassConfig::createRVVRegAllocPass(bool Optimized) {
}
bool RISCVPassConfig::addRegAssignAndRewriteFast() {
+ addPass(createRISCVVMV0EliminationPass());
addPass(createRVVRegAllocPass(false));
addPass(createRISCVInsertVSETVLIPass());
if (TM->getOptLevel() != CodeGenOptLevel::None &&
@@ -454,6 +455,7 @@ bool RISCVPassConfig::addRegAssignAndRewriteFast() {
}
bool RISCVPassConfig::addRegAssignAndRewriteOptimized() {
+ addPass(createRISCVVMV0EliminationPass());
addPass(createRVVRegAllocPass(true));
addPass(createVirtRegRewriter(false));
addPass(createRISCVInsertVSETVLIPass());
@@ -618,8 +620,6 @@ void RISCVPassConfig::addPreRegAlloc() {
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMachinePipeliner)
addPass(&MachinePipelinerID);
-
- addPass(createRISCVVMV0EliminationPass());
}
void RISCVPassConfig::addFastRegAlloc() {
diff --git a/llvm/lib/Target/RISCV/RISCVVMV0Elimination.cpp b/llvm/lib/Target/RISCV/RISCVVMV0Elimination.cpp
index 9270a5b98a142..4e76450998400 100644
--- a/llvm/lib/Target/RISCV/RISCVVMV0Elimination.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVMV0Elimination.cpp
@@ -34,6 +34,9 @@
#ifndef NDEBUG
#include "llvm/ADT/PostOrderIterator.h"
#endif
+#include "llvm/CodeGen/LiveDebugVariables.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
@@ -51,15 +54,14 @@ class RISCVVMV0Elimination : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
- MachineFunctionProperties getRequiredProperties() const override {
- // TODO: We could move this closer to regalloc, out of SSA, which would
- // allow scheduling past mask operands. We would need to preserve live
- // intervals.
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::IsSSA);
+ AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
+ AU.addPreserved<LiveIntervalsWrapperPass>();
+ AU.addPreserved<SlotIndexesWrapperPass>();
+ AU.addPreserved<LiveDebugVariablesWrapperLegacy>();
+ AU.addPreserved<LiveStacksWrapperLegacy>();
+
+ MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -88,12 +90,14 @@ bool RISCVVMV0Elimination::runOnMachineFunction(MachineFunction &MF) {
return false;
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const TargetInstrInfo *TII = ST->getInstrInfo();
+ auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
+ LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
#ifndef NDEBUG
// Assert that we won't clobber any existing reads of v0 where we need to
// insert copies.
- const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
for (MachineBasicBlock *MBB : RPOT) {
bool V0Clobbered = false;
@@ -115,7 +119,6 @@ bool RISCVVMV0Elimination::runOnMachineFunction(MachineFunction &MF) {
#endif
bool MadeChange = false;
- SmallVector<MachineInstr *> DeadCopies;
// For any instruction with a vmv0 operand, replace it with a copy to v0.
for (MachineBasicBlock &MBB : MF) {
@@ -127,23 +130,39 @@ bool RISCVVMV0Elimination::runOnMachineFunction(MachineFunction &MF) {
if (isVMV0(MCOI)) {
MachineOperand &MO = MI.getOperand(OpNo);
Register Src = MO.getReg();
- assert(MO.isUse() && MO.getSubReg() == RISCV::NoSubRegister &&
- Src.isVirtual() && "vmv0 use in unexpected form");
-
- // Peek through a single copy to match what isel does.
- if (MachineInstr *SrcMI = MRI.getVRegDef(Src);
- SrcMI->isCopy() && SrcMI->getOperand(1).getReg().isVirtual() &&
- SrcMI->getOperand(1).getSubReg() == RISCV::NoSubRegister) {
- // Delete any dead copys to vmv0 to avoid allocating them.
- if (MRI.hasOneNonDBGUse(Src))
- DeadCopies.push_back(SrcMI);
- Src = SrcMI->getOperand(1).getReg();
+ assert(MO.isUse() && Src.isVirtual() &&
+ "vmv0 use in unexpected form");
+
+ // If undef don't emit a copy, since the IMPLICIT_DEF Src will no
+ // longer exist at this stage.
+ if (MO.isUndef()) {
+ MO.setReg(RISCV::V0);
+ MadeChange = true;
+ break;
}
- BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(RISCV::COPY), RISCV::V0)
- .addReg(Src);
-
+ MachineInstr *Copy = BuildMI(MBB, MI, MI.getDebugLoc(),
+ TII->get(RISCV::COPY), RISCV::V0)
+ .addReg(Src, 0, MO.getSubReg());
MO.setReg(RISCV::V0);
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*Copy);
+ SlotIndex CopySI = LIS->getInstructionIndex(*Copy).getRegSlot();
+ SlotIndex MISI = LIS->getInstructionIndex(MI).getRegSlot();
+
+ assert(std::distance(TRI->regunits(RISCV::V0).begin(),
+ TRI->regunits(RISCV::V0).end()) == 1);
+ unsigned Unit = *TRI->regunits(RISCV::V0).begin();
+
+ // Create a new definition of V0 from Copy To MI.
+ if (LiveRange *LR = LIS->getCachedRegUnit(Unit)) {
+ VNInfo *VNI = LR->getNextValue(CopySI, LIS->getVNInfoAllocator());
+ LR->addSegment(LiveInterval::Segment(CopySI, MISI, VNI));
+ }
+
+ // Shrink Src's interval now that MI doesn't use it.
+ LIS->shrinkToUses(&LIS->getInterval(Src));
+ }
MadeChange = true;
break;
}
@@ -151,9 +170,6 @@ bool RISCVVMV0Elimination::runOnMachineFunction(MachineFunction &MF) {
}
}
- for (MachineInstr *MI : DeadCopies)
- MI->eraseFromParent();
-
if (!MadeChange)
return false;
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index 694662eab1681..3e7f59d4c5f5a 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -43,10 +43,10 @@
; CHECK-NEXT: RISC-V Insert Read/Write CSR Pass
; CHECK-NEXT: RISC-V Insert Write VXRM Pass
; CHECK-NEXT: RISC-V Landing Pad Setup
-; CHECK-NEXT: RISC-V VMV0 Elimination
; CHECK-NEXT: Init Undef Pass
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass
+; CHECK-NEXT: RISC-V VMV0 Elimination
; CHECK-NEXT: Fast Register Allocator
; CHECK-NEXT: RISC-V Insert VSETVLI pass
; CHECK-NEXT: Fast Register Allocator
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index beef7a574dc4f..c24152a021c61 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -128,7 +128,6 @@
; CHECK-NEXT: RISC-V Insert Read/Write CSR Pass
; CHECK-NEXT: RISC-V Insert Write VXRM Pass
; CHECK-NEXT: RISC-V Landing Pad Setup
-; CHECK-NEXT: RISC-V VMV0 Elimination
; CHECK-NEXT: Detect Dead Lanes
; CHECK-NEXT: Init Undef Pass
; CHECK-NEXT: Process Implicit Definitions
@@ -141,6 +140,7 @@
; CHECK-NEXT: Register Coalescer
; CHECK-NEXT: Rename Disconnected Subregister Components
; CHECK-NEXT: Machine Instruction Scheduler
+; CHECK-NEXT: RISC-V VMV0 Elimination
; CHECK-NEXT: Machine Block Frequency Analysis
; CHECK-NEXT: Debug Variable Analysis
; CHECK-NEXT: Live Stack Slot Analysis
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index df27b096967a2..5ca4dc8e21c44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -15,20 +15,20 @@ define <vscale x 1 x i8> @vp_bitreverse_nxv1i8(<vscale x 1 x i8> %va, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT: vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: li a0, 51
; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t
-; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: vand.vi v8, v8, 15, v0.t
; CHECK-NEXT: vor.vv v8, v8, v9, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
@@ -81,20 +81,20 @@ define <vscale x 2 x i8> @vp_bitreverse_nxv2i8(<vscale x 2 x i8> %va, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: li a0, 51
; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t
-; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: vand.vi v8, v8, 15, v0.t
; CHECK-NEXT: vor.vv v8, v8, v9, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
@@ -147,20 +147,20 @@ define <vscale x 4 x i8> @vp_bitreverse_nxv4i8(<vscale x 4 x i8> %va, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: li a0, 51
; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t
-; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: vand.vi v8, v8, 15, v0.t
; CHECK-NEXT: vor.vv v8, v8, v9, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
@@ -213,20 +213,20 @@ define <vscale x 8 x i8> @vp_bitreverse_nxv8i8(<vscale x 8 x i8> %va, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: li a0, 51
; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t
-; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: vand.vi v8, v8, 15, v0.t
; CHECK-NEXT: vor.vv v8, v8, v9, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
@@ -279,20 +279,20 @@ define <vscale x 16 x i8> @vp_bitreverse_nxv16i8(<vscale x 16 x i8> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-NEXT: vand.vi v10, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: li a0, 51
; CHECK-NEXT: vsll.vi v10, v10, 4, v0.t
-; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: vand.vi v8, v8, 15, v0.t
; CHECK-NEXT: vor.vv v8, v8, v10, v0.t
; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v10, v10, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v10, v10, a0, v0.t
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v10, v8, v0.t
; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v10, v10, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v10, v10, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
@@ -345,20 +345,20 @@ define <vscale x 32 x i8> @vp_bitreverse_nxv32i8(<vscale x 32 x i8> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
; CHECK-NEXT: vand.vi v12, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: li a0, 51
; CHECK-NEXT: vsll.vi v12, v12, 4, v0.t
-; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: vand.vi v8, v8, 15, v0.t
; CHECK-NEXT: vor.vv v8, v8, v12, v0.t
; CHECK-NEXT: vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v12, v12, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v12, v12, a0, v0.t
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v12, v8, v0.t
; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v12, v12, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v12, v12, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
@@ -411,20 +411,20 @@ define <vscale x 64 x i8> @vp_bitreverse_nxv64i8(<vscale x 64 x i8> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vand.vi v16, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: li a0, 51
; CHECK-NEXT: vsll.vi v16, v16, 4, v0.t
-; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t
; CHECK-NEXT: vand.vi v8, v8, 15, v0.t
-; CHECK-NEXT: vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT: vsrl.vi v8, v16, 2, v0.t
-; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsrl.vi v24, v8, 2, v0.t
+; CHECK-NEXT: vand.vx v16, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v8, v24, a0, v0.t
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsll.vi v16, v16, 2, v0.t
; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v16, v16, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
@@ -477,27 +477,27 @@ define <vscale x 1 x i16> @vp_bitreverse_nxv1i16(<vscale x 1 x i16> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT: lui a0, 1
; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT: addi a0, a0, -241
+; CHECK-NEXT: lui a0, 1
; CHECK-NEXT: vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT: addi a0, a0, -241
; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: lui a0, 3
; CHECK-NEXT: addi a0, a0, 819
; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: lui a0, 5
; CHECK-NEXT: addi a0, a0, 1365
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
@@ -557,27 +557,27 @@ define <vscale x 2 x i16> @vp_bitreverse_nxv2i16(<vscale x 2 x i16> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT: lui a0, 1
; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT: addi a0, a0, -241
+; CHECK-NEXT: lui a0, 1
; CHECK-NEXT: vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT: addi a0, a0, -241
; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: lui a0, 3
; CHECK-NEXT: addi a0, a0, 819
; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: lui a0, 5
; CHECK-NEXT: addi a0, a0, 1365
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
@@ -637,27 +637,27 @@ define <vscale x 4 x i16> @vp_bitreverse_nxv4i16(<vscale x 4 x i16> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t
-; CHECK-NEXT: lui a0, 1
; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT: addi a0, a0, -241
+; CHECK-NEXT: lui a0, 1
; CHECK-NEXT: vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT: addi a0, a0, -241
; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: lui a0, 3
; CHECK-NEXT: addi a0, a0, 819
; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: lui a0, 5
; CHECK-NEXT: addi a0, a0, 1365
; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v9, v9, a0, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
; CHECK-NEXT: vor.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
@@ -717,27 +717,27 @@ define <vscale x 8 x i16> @vp_bitreverse_nxv8i16(<vscale x 8 x i16> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t
-; CHECK-NEXT: lui a0, 1
; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT: addi a0, a0, -241
+; CHECK-NEXT: lui a0, 1
; CHECK-NEXT: vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT: addi a0, a0, -241
; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT: vand.vx v10, v10, a0, v0.t
; CHECK-NEXT: vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vand.vx v10, v10, a0, v0.t
; CHECK-NEXT: lui a0, 3
; CHECK-NEXT: addi a0, a0, 819
; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t
; CHECK-NEXT: vor.vv v8, v10, v8, v0.t
; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT: vand.vx v10, v10, ...
[truncated]
FYI, I started looking at this from the perspective of the test diffs. They are unfortunately massive, and include both improvements and regressions, and a few oddities. I need to convince myself about a couple of oddities I spotted, but I think we're going to need performance data to justify that this is actually net positive.
I went and looked into this. The one and only case in check-llvm is the following:
There doesn't seem to be anything too oddly specific about @foo from test/CodeGen/RISCV/rvv/pr88576.ll (which is where this came from). This looks correct (i.e. it is a valid subreg usage). It is interesting that the use of the subregister bit ends up so narrow here. Tracing through print-after-all, it looks like RegisterCoalescer eliminates a copy and creates the sub-register use.
For some regressions, I don't know if they are related to a reg pressure set limit issue I have been looking at for a long time (but I was occupied by other urgent issues, so I haven't fixed it :-( ):

// Get the register unit pressure limit for this dimension.
// This limit must be adjusted dynamically for reserved registers.
unsigned RISCVGenRegisterInfo::
getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const {
static const uint8_t PressureLimitTable[] = {
2, // 0: GPRC_and_SR07
2, // 1: GPRX0
2, // 2: SP
2, // 3: GPRX7
3, // 4: GPRX1
8, // 5: FPR16C
8, // 6: GPRF16C
8, // 7: SR07
8, // 8: VMV0
14, // 9: GPRF16C_with_SR07
16, // 10: GPRTC
24, // 11: VRM8NoV0
32, // 12: FPR16
32, // 13: VM
33, // 14: GPRAll
};
return PressureLimitTable[Idx];
}

Apparently, the limit of 8 for VMV0 seems too high. Anyway, I really appreciate this work and am looking forward to making it go further!
Could I ask you to file an issue with your failure reproducers? I just tried the limit of 2 locally, and didn't see any obvious problems in make check-llvm. I've got a little bit of weirdness in test changes, but no failures.
@wangpc-pp I've also been meaning to look at this.
Sorry, I think my wording just misled you. I meant we may see some weirdness (not failures) because we model the register pressure set limit too high or too low. What you saw is expected, I think. :-)
Looks like there's a 10% regression in x264 with this patch: https://lnt.lukelau.me/db_default/v4/nts/348?show_delta=yes&show_previous=yes&show_stddev=yes&show_mad=yes&show_all=yes&show_all_samples=yes&num_comparison_runs=0&test_filter=&test_min_value_filter=&aggregation_fn=min&MW_confidence_lv=0.05&compare_to=352&submit=Update

From a quick glance there does seem to be a lot more spilling in one of the kernels, so I think you may be right about the register pressure calculation being off! Will investigate further.
Some updates: we can't simply set the limit of VMV0 to smaller values (2 for example).
The reason it is 8 is that we have some synthesized register classes and VMV0 is subsumed by these classes, which causes the unreasonably high limit for VMV0. There must be something wrong somewhere, but I'm still investigating (the way we model register classes is complicated, and I don't know if anyone is still working on that part, since it hasn't changed in decades 😢).
Yes, I was looking a bit deeper into this too. I found this explanation, which might be more useful: https://lists.llvm.org/pipermail/llvm-dev/2016-May/100019.html

If I'm understanding it correctly, the limit is the maximum number of register units that might interfere with the virtual register set? So if something is assigned to v0m8 it would add 8 to VMV0's pressure set?
Yes, we will add the return value of
Oh, I think I see the underlying cause of the x264 regression: it's #107532. I.e. the machine scheduler is now more free to reschedule masked pseudos, which results in a lot of vector spills in x264_pixel_satd_16x16. It happens under -flto -O3 without a scheduling model; either applying #126608 or using a scheduling model avoids it.

Specifically, x264_pixel_satd_16x16 is completely inlined, and in it there are a few masked vslidedowns. These must have been acting as a barrier preventing the aggressive rescheduling + spilling:

2290: 3ed134d7 vslidedown.vi v9, v13, 0x2
2294: 3c8134d7 vslidedown.vi v9, v8, 0x2, v0.t
2298: c900f057 vsetivli zero, 0x1, e32, m1, tu, ma
229c: 5e068457 vmv.v.v v8, v13
22a0: cd027057 vsetivli zero, 0x4, e32, m1, ta, ma
22a4: 029406d7 vadd.vv v13, v9, v8
22a8: 0a848457 vsub.vv v8, v8, v9
22ac: 3a8136d7 vslideup.vi v13, v8, 0x2
22b0: 020a8407 vle8.v v8, (s5)
22b4: 020b0487 vle8.v v9, (s6)
22b8: 0c607057 vsetvli zero, zero, e8, mf4, ta, ma
22bc: ca84a7d7 vwsubu.vv v15, v8, v9
22c0: 0d007057 vsetvli zero, zero, e32, m1, ta, ma
22c4: 4af32457 vzext.vf2 v8, v15
22c8: 96883457 vsll.vi v8, v8, 0x10
22cc: 02850457 vadd.vv v8, v8, v10
22d0: cd817057 vsetivli zero, 0x2, e64, m1, ta, ma
22d4: a28544d7 vsrl.vx v9, v8, a0
22d8: 96854557 vsll.vx v10, v8, a0
22dc: 2aa484d7 vor.vv v9, v10, v9
22e0: c5027057 vsetivli zero, 0x4, e32, m1, ta, mu
22e4: 02940557 vadd.vv v10, v9, v8
22e8: 0a940457 vsub.vv v8, v9, v8
22ec: 3ea134d7 vslidedown.vi v9, v10, 0x2
22f0: 3c8134d7 vslidedown.vi v9, v8, 0x2, v0.t

I think before this can land we need to either enable MicroOpBufferSize=1 by relanding #126608 (it might be that it needs to land in tandem with this patch?), or choose a scheduling model by default (we might need to add a generic in-order model).