Skip to content

[RISCV] Support load clustering in the MachineScheduler (off by default) #73754

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
Expand Down Expand Up @@ -2231,6 +2232,60 @@ bool RISCVInstrInfo::getMemOperandsWithOffsetWidth(
return true;
}

// TODO: This was copied from SIInstrInfo. Could it be lifted to a common
// helper?
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
ArrayRef<const MachineOperand *> BaseOps1,
const MachineInstr &MI2,
ArrayRef<const MachineOperand *> BaseOps2) {
// Only examine the first "base" operand of each instruction, on the
// assumption that it represents the real base address of the memory access.
// Other operands are typically offsets or indices from this base address.
if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
return true;

if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
return false;

auto MO1 = *MI1.memoperands_begin();
auto MO2 = *MI2.memoperands_begin();
if (MO1->getAddrSpace() != MO2->getAddrSpace())
return false;

auto Base1 = MO1->getValue();
auto Base2 = MO2->getValue();
if (!Base1 || !Base2)
return false;
Base1 = getUnderlyingObject(Base1);
Base2 = getUnderlyingObject(Base2);

if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
return false;

return Base1 == Base2;
}

bool RISCVInstrInfo::shouldClusterMemOps(
ArrayRef<const MachineOperand *> BaseOps1,
ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
if (!BaseOps1.empty() && !BaseOps2.empty()) {
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
} else if (!BaseOps1.empty() || !BaseOps2.empty()) {
// If only one base op is empty, they do not have the same base ptr
return false;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi. Can there be a hook to implement the load/store fusion (like with same base register and different offset) like PowerPC does?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What kind of logic were you imagining? The shouldClusterMemOps logic we're using right now should in general cluster in far more cases than the PPC implementation (which as you say, has a hook to pick up just those cases what may support load/store fusion).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh. What I mean is too generic. And constraint to require the same opcode with two cluster load/store and the memory addresses also need consecutive, which means second offset = first offset + first width.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ooops. I see the follow-up PR #73778, that what I want.

// TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
// indicate they likely share a cache line.
return ClusterSize <= 4;
}

// Set BaseReg (the base register operand), Offset (the byte offset being
// accessed) and the access Width of the passed instruction that reads/writes
// memory. Returns false if the instruction does not read/write memory or the
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const override;

bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
ArrayRef<const MachineOperand *> BaseOps2,
unsigned ClusterSize,
unsigned NumBytes) const override;

bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
const MachineOperand *&BaseOp,
int64_t &Offset, unsigned &Width,
Expand Down
15 changes: 12 additions & 3 deletions llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ static cl::opt<bool>
cl::desc("Enable Split RegisterAlloc for RVV"),
cl::init(false));

static cl::opt<bool> EnableMISchedLoadClustering(
"riscv-misched-load-clustering", cl::Hidden,
cl::desc("Enable load clustering in the machine scheduler"),
cl::init(false));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
Expand Down Expand Up @@ -345,12 +350,16 @@ class RISCVPassConfig : public TargetPassConfig {
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
ScheduleDAGMILive *DAG = nullptr;
if (EnableMISchedLoadClustering) {
DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
}
if (ST.hasMacroFusion()) {
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG = DAG ? DAG : createGenericSchedLive(C);
DAG->addMutation(createRISCVMacroFusionDAGMutation());
return DAG;
}
return nullptr;
return DAG;
}

ScheduleDAGInstrs *
Expand Down
43 changes: 43 additions & 0 deletions llvm/test/CodeGen/RISCV/misched-load-clustering.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
; REQUIRES: asserts
; RUN: llc -mtriple=riscv32 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
; RUN: | FileCheck -check-prefix=NOCLUSTER %s
; RUN: llc -mtriple=riscv64 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
; RUN: | FileCheck -check-prefix=NOCLUSTER %s
; RUN: llc -mtriple=riscv32 -riscv-misched-load-clustering -verify-misched \
; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
; RUN: | FileCheck -check-prefix=LDCLUSTER %s
; RUN: llc -mtriple=riscv64 -riscv-misched-load-clustering -verify-misched \
; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
; RUN: | FileCheck -check-prefix=LDCLUSTER %s


define i32 @load_clustering_1(ptr nocapture %p) {
; NOCLUSTER: ********** MI Scheduling **********
; NOCLUSTER-LABEL: load_clustering_1:%bb.0
; NOCLUSTER: *** Final schedule for %bb.0 ***
; NOCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
; NOCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
; NOCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
; NOCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
;
; LDCLUSTER: ********** MI Scheduling **********
; LDCLUSTER-LABEL: load_clustering_1:%bb.0
; LDCLUSTER: *** Final schedule for %bb.0 ***
; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
; LDCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
entry:
%arrayidx0 = getelementptr inbounds i32, ptr %p, i32 3
%val0 = load i32, i32* %arrayidx0
%arrayidx1 = getelementptr inbounds i32, ptr %p, i32 2
%val1 = load i32, i32* %arrayidx1
%tmp0 = add i32 %val0, %val1
%arrayidx2 = getelementptr inbounds i32, ptr %p, i32 1
%val2 = load i32, i32* %arrayidx2
%tmp1 = add i32 %tmp0, %val2
%arrayidx3 = getelementptr inbounds i32, ptr %p, i32 4
%val3 = load i32, i32* %arrayidx3
%tmp2 = add i32 %tmp1, %val3
ret i32 %tmp2
}