[MachineScheduler][RISCV] Release the pending queue based on condition #125468
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Piyou Chen (BeMg)

Changes

During scheduling, an SUnit is pushed into the pending queue when a hazard occurs. Because of this, those SUnits are not considered in register pressure measurement, which can cause register spilling in high-register-pressure regions.

This patch adds a hook to release nodes from the pending queue based on target register pressure information, and includes an option to control this feature. This may help avoid spill/reload operations for in-order cores in high-register-pressure regions.
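To make the mechanism concrete, here is a minimal self-contained sketch of the idea. Unit, Boundary, and the threshold value are illustrative stand-ins, not LLVM's SUnit/SchedBoundary; the real implementation is in the diff below. Hazarded units sit in a Pending queue that the register-pressure heuristics cannot see, and the new hook keeps advancing the cycle until a pressure-relieving unit is released into Available:

#include <algorithm>
#include <iostream>
#include <vector>

// Illustrative stand-ins for SUnit/SchedBoundary; not LLVM's real types.
struct Unit {
  int Id;
  unsigned ReadyCycle;  // cycle at which the hazard clears
  int VecPressureDelta; // < 0: scheduling this unit lowers vector pressure
};

struct Boundary {
  unsigned CurrCycle = 0;
  std::vector<Unit> Available, Pending;

  // Move every pending unit whose hazard has cleared into Available.
  void releasePending() {
    for (auto It = Pending.begin(); It != Pending.end();) {
      if (It->ReadyCycle <= CurrCycle) {
        Available.push_back(*It);
        It = Pending.erase(It);
      } else {
        ++It;
      }
    }
  }

  // Core idea of the patch: keep bumping the cycle until the chosen
  // pressure-relieving unit leaves Pending, accepting a stall in
  // exchange for avoiding a spill.
  void bumpCycleUntilReleased(int Id) {
    auto InPending = [&] {
      return std::any_of(Pending.begin(), Pending.end(),
                         [&](const Unit &U) { return U.Id == Id; });
    };
    while (InPending()) {
      ++CurrCycle;
      releasePending();
    }
  }
};

int main() {
  Boundary B;
  B.Pending = {{/*Id=*/0, /*ReadyCycle=*/3, /*VecPressureDelta=*/-4}};

  // Mirrors the role of needReleaseSUFromPendingQueue with a made-up
  // threshold: release units whose pressure delta is strongly negative.
  const int Threshold = -3;
  std::vector<int> ToRelease;
  for (const Unit &U : B.Pending)
    if (U.VecPressureDelta < Threshold)
      ToRelease.push_back(U.Id);
  for (int Id : ToRelease)
    B.bumpCycleUntilReleased(Id);

  std::cout << "cycle=" << B.CurrCycle
            << " available=" << B.Available.size() << "\n"; // cycle=3 available=1
}

In the patch itself, this behavior is gated by the hidden -misched-release-pending-queue flag and driven by the new TargetRegisterInfo hooks (needReleasePendingQueue and needReleaseSUFromPendingQueue) shown in the diff below.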
Patch is 491.04 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/125468.diff

65 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 660670ccdcd75b4..47809606ff40754 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1068,6 +1068,13 @@ class SchedBoundary {
/// Dump the state of the information that tracks resource usage.
void dumpReservedCycles() const;
void dumpScheduledState() const;
+
+ void bumpCycleUntilReleaseSUFromPending(SUnit *SU) {
+ while (!Pending.empty() && llvm::find(Pending, SU) != Pending.end()) {
+ bumpCycle(CurrCycle + 1);
+ releasePending();
+ }
+ }
};
/// Base class for GenericScheduler. This class maintains information about
@@ -1262,6 +1269,8 @@ class GenericScheduler : public GenericSchedulerBase {
BotCand.SU = nullptr;
}
+ void bumpCycleUntilReleaseSUFromPending(bool IsTop);
+
void registerRoots() override;
protected:
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 114149ff53d850b..270b9cd8de1df58 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1190,6 +1190,23 @@ class TargetRegisterInfo : public MCRegisterInfo {
return false;
}
+ /// Based on the target and current register pressure information from the
+ /// Scheduler, determine whether to release the node in the pending queue
+ virtual bool
+ needReleasePendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> MaxSetPressure) const {
+ return false;
+ }
+
+ /// For each SUnit, determine whether to release it
+ /// from the pending queue based on the register pressure changes
+ /// associated with that SUnit.
+ virtual bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const {
+ return false;
+ }
+
//===--------------------------------------------------------------------===//
/// Debug information queries.
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 393530f56cc27ee..586c8857bb199fc 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -161,6 +161,10 @@ static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
cl::desc("Enable memop clustering."),
cl::init(true));
+static cl::opt<bool>
+ EnableReleasePendingQ("misched-release-pending-queue", cl::Hidden,
+ cl::desc("Release the pending queue"),
+ cl::init(true));
static cl::opt<bool>
ForceFastCluster("force-fast-cluster", cl::Hidden,
cl::desc("Switch to fast cluster algorithm with the lost "
@@ -3656,6 +3660,37 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
}
}
+void GenericScheduler::bumpCycleUntilReleaseSUFromPending(bool IsTop) {
+ if (!DAG->isTrackingPressure())
+ return;
+ auto releasePending = [&](ReadyQueue &Q, const RegPressureTracker &RegP,
+ ArrayRef<unsigned> MaxSetP, SchedBoundary &SchedB) {
+ for (SUnit *SU : Q) {
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RegP);
+ CandPolicy TempPolicy;
+ SchedCandidate TryCand(TempPolicy);
+ initCandidate(TryCand, SU, IsTop, RegP, TempTracker);
+ PressureDiff PDiff = DAG->getPressureDiff(SU);
+ SmallVector<unsigned> PSetIDs;
+ SmallVector<int> UnitIncs;
+ for (const auto &PChange : PDiff) {
+ if (!PChange.isValid())
+ continue;
+ PSetIDs.push_back(PChange.getPSet());
+ UnitIncs.push_back(PChange.getUnitInc());
+ }
+ if (TRI->needReleaseSUFromPendingQueue(DAG->MF, PSetIDs, UnitIncs))
+ SchedB.bumpCycleUntilReleaseSUFromPending(SU);
+ }
+ };
+ if (IsTop)
+ releasePending(Top.Pending, DAG->getTopRPTracker(),
+ DAG->getTopRPTracker().getPressure().MaxSetPressure, Top);
+ else
+ releasePending(Bot.Pending, DAG->getBotRPTracker(),
+ DAG->getBotRPTracker().getPressure().MaxSetPressure, Bot);
+}
+
/// Pick the best candidate node from either the top or bottom queue.
SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
@@ -3741,6 +3776,16 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
+
+ if (EnableReleasePendingQ && !RegionPolicy.OnlyBottomUp &&
+ TRI->needReleasePendingQueue(
+ DAG->MF, DAG->getTopRPTracker().getPressure().MaxSetPressure))
+ bumpCycleUntilReleaseSUFromPending(/*IsTop=*/true);
+ if (EnableReleasePendingQ && !RegionPolicy.OnlyTopDown &&
+ TRI->needReleasePendingQueue(
+ DAG->MF, DAG->getBotRPTracker().getPressure().MaxSetPressure))
+ bumpCycleUntilReleaseSUFromPending(/*IsTop=*/false);
+
SUnit *SU;
do {
if (RegionPolicy.OnlyTopDown) {
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index b0a52698c1e9f10..91605c5acda0cb2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -954,3 +954,36 @@ bool RISCVRegisterInfo::getRegAllocationHints(
return BaseImplRetVal;
}
+
+bool RISCVRegisterInfo::needReleasePendingQueue(
+ MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
+ for (unsigned Idx = 0; Idx < MaxSetPressure.size(); Idx++) {
+ // Consider only the RVV Register, as RVV spilling/reloading has higher
+ // potential costs than hazards.
+ if (!StringRef(getRegPressureSetName(Idx)).starts_with("VM") &&
+ !StringRef(getRegPressureSetName(Idx)).starts_with("VRM8NoV0"))
+ continue;
+ const unsigned RVVRegPressureThreshold = 7;
+ if (MaxSetPressure[Idx] + RVVRegPressureThreshold >
+ getRegPressureSetLimit(MF, Idx)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool RISCVRegisterInfo::needReleaseSUFromPendingQueue(
+ MachineFunction &MF, ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const {
+ const int UnitIncRVVRegPressureThreshold = -3;
+ for (unsigned Idx = 0; Idx < PSetID.size(); Idx++) {
+ // Consider only the RVV Register, as RVV spilling/reloading has higher
+ // potential costs than hazards.
+ if (!StringRef(getRegPressureSetName(PSetID[Idx])).starts_with("VM") &&
+        !StringRef(getRegPressureSetName(PSetID[Idx])).starts_with("VRM8NoV0"))
+ continue;
+ if (UnitInc[Idx] < UnitIncRVVRegPressureThreshold)
+ return true;
+ }
+ return false;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694e175c8a..faf81b2d8b73d65 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,6 +144,13 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
static bool isRVVRegClass(const TargetRegisterClass *RC) {
return RISCVRI::isVRegClass(RC->TSFlags);
}
+ bool
+ needReleasePendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> MaxSetPressure) const override;
+
+ bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const override;
};
} // namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index 1ed84316d4484cd..cd795f722676bd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -1137,38 +1137,38 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-NEXT: li a1, 56
; RV32-NEXT: li a2, 40
; RV32-NEXT: lui a3, 16
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: addi a5, sp, 8
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: vsrl.vx v24, v8, a2
-; RV32-NEXT: addi a0, a3, -256
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsll.vx v0, v8, a1
-; RV32-NEXT: vand.vx v24, v24, a0
+; RV32-NEXT: vand.vx v24, v24, a3
; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v16, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a3
; RV32-NEXT: vsll.vx v16, v16, a2
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v0, (a5), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a4
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: lui a1, 4080
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v0, a1
; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v24, v24, v0
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v24
-; RV32-NEXT: vand.vv v16, v8, v0
-; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: vand.vx v8, v8, a1
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vsll.vi v16, v16, 8
; RV32-NEXT: vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 4d34621cd5f243c..8ae560c07e21016 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -2288,66 +2288,68 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v24, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: li a2, 56
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
; RV32-NEXT: addi a1, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v16, v24, a5, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a6), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT: vand.vx v24, v24, a1, v0.t
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsrl.vx v16, v24, a2, v0.t
+; RV32-NEXT: vsrl.vx v8, v24, a4, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a3, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT: vand.vx v16, v8, a5, v0.t
+; RV32-NEXT: vsrl.vi v8, v24, 8, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -2497,40 +2499,40 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: addi a6, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2
-; RV32-NEXT: addi a1, a3, -256
; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsrl.vx v0, v8, a4
-; RV32-NEXT: vand.vx v0, v0, a1
+; RV32-NEXT: vand.vx v0, v0, a3
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v24, v8, a1
+; RV32-NEXT: addi a6, sp, 16
+; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v24, v8, a3
; RV32-NEXT: vsll.vx v24, v24, a4
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v24, (a6), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a5
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v24, a5
+; RV32-NEXT: vsrl.vi v24, v8, 8
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v0, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v24, v8, v24
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v24, v8, v16
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vor.vv v8, v0, v8
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: lui a3, 349525
@@ -2673,66 +2675,68 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v24, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: li a2, 56
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
; RV32-NEXT: addi a1, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v16, v24, a5, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a6), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16,...
[truncated]
@@ -1068,6 +1068,13 @@ class SchedBoundary {
   /// Dump the state of the information that tracks resource usage.
   void dumpReservedCycles() const;
   void dumpScheduledState() const;
+
+  void bumpCycleUntilReleaseSUFromPending(SUnit *SU) {
+    while (!Pending.empty() && llvm::find(Pending, SU) != Pending.end()) {
We don't need !Pending.empty(), since llvm::find will return end() anyway for an empty container.
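A quick standalone check of that claim (plain C++, independent of LLVM; llvm::find is a range-based wrapper around std::find):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Pending; // empty "queue"
  // std::find on an empty range returns end() immediately, so guarding
  // the lookup with !Pending.empty() is redundant.
  assert(std::find(Pending.begin(), Pending.end(), 42) == Pending.end());
  return 0;
}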
And what is the compile-time impact of this patch? It looks like it could be costly here.
Do you still see the spill/reload even with a real CPU scheduling model? I wonder if the problem you are trying to solve is the same as #107532.
    // potential costs than hazards.
    if (!isRVVPressureSetIdx(PSetID[Idx]))
      continue;
    if (UnitInc[Idx] < UnitIncRVVRegPressureThreshold)
Do both TopDown and BottomUp use the same threshold? IIRC, they have different computations of register pressure, so the thresholds may need to differ.
In most cases, yes. The lit changes in #126608 will cover this patch's lit changes.
This patch aims to resolve the kind of problem (balancing hazards against spilling) that setting MicroOpBufferSize = 1 cannot address.
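For context on why MicroOpBufferSize matters here, below is a runnable paraphrase of the Pending/Available split in SchedBoundary::releaseNode, using stub types rather than LLVM's real classes (a sketch, assuming the simplified branch captures the relevant behavior): with an unbuffered model (MicroOpBufferSize == 0), a unit that is not yet ready counts as hazarded and is parked in Pending, which is exactly where the pressure heuristics lose sight of it.

#include <vector>

// Stand-ins for LLVM's types; illustrative only.
struct SUnit {
  unsigned ReadyCycle = 0;
};

struct SchedModelStub {
  unsigned MicroOpBufferSize; // 0 => fully in-order: stall on latency
  unsigned getMicroOpBufferSize() const { return MicroOpBufferSize; }
};

struct BoundaryStub {
  SchedModelStub SchedModel;
  unsigned CurrCycle = 0;
  std::vector<SUnit *> Pending, Available;

  bool checkHazard(SUnit *) const { return false; } // structural hazards elided

  // Paraphrase of SchedBoundary::releaseNode's queue choice: when the
  // model is unbuffered, a not-yet-ready unit is treated like a hazard
  // and parked in Pending instead of Available.
  void releaseNode(SUnit *SU) {
    bool IsBuffered = SchedModel.getMicroOpBufferSize() != 0;
    if ((!IsBuffered && SU->ReadyCycle > CurrCycle) || checkHazard(SU))
      Pending.push_back(SU);
    else
      Available.push_back(SU);
  }
};

int main() {
  SUnit Late{/*ReadyCycle=*/5};
  BoundaryStub InOrder{{/*MicroOpBufferSize=*/0}};
  BoundaryStub Buffered{{/*MicroOpBufferSize=*/16}};
  InOrder.releaseNode(&Late);  // parked in Pending
  Buffered.releaseNode(&Late); // goes straight to Available
  return (InOrder.Pending.size() == 1 && Buffered.Available.size() == 1) ? 0 : 1;
}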
I think we can close this pull request for now and wait for an example where a real CPU exhibits a spill issue (a hazard versus register pressure trade-off). Thank you all for your review comments. :)