Skip to content

[RISCV][RFC/WIP] Enable load clustering in SelectionDAG scheduler #74600

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions llvm/include/llvm/CodeGen/MachineScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -1347,13 +1347,19 @@ ScheduleDAGMILive *createGenericSchedLive(MachineSchedContext *C);
/// Create a generic scheduler with no vreg liveness or DAG mutation passes.
ScheduleDAGMI *createGenericSchedPostRA(MachineSchedContext *C);

/// If ReorderWhileClustering is set to true, no attempt will be made to
/// reduce reordering due to store clustering.
std::unique_ptr<ScheduleDAGMutation>
createLoadClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI);
const TargetRegisterInfo *TRI,
bool ReorderWhileClustering = false);

/// If ReorderWhileClustering is set to true, no attempt will be made to
/// reduce reordering due to store clustering.
std::unique_ptr<ScheduleDAGMutation>
createStoreClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI);
const TargetRegisterInfo *TRI,
bool ReorderWhileClustering = false);

std::unique_ptr<ScheduleDAGMutation>
createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
Expand Down
31 changes: 20 additions & 11 deletions llvm/lib/CodeGen/MachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1743,11 +1743,14 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
bool IsLoad;
bool ReorderWhileClustering;

public:
BaseMemOpClusterMutation(const TargetInstrInfo *tii,
const TargetRegisterInfo *tri, bool IsLoad)
: TII(tii), TRI(tri), IsLoad(IsLoad) {}
const TargetRegisterInfo *tri, bool IsLoad,
bool ReorderWhileClustering)
: TII(tii), TRI(tri), IsLoad(IsLoad),
ReorderWhileClustering(ReorderWhileClustering) {}

void apply(ScheduleDAGInstrs *DAGInstrs) override;

Expand All @@ -1763,14 +1766,16 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
class StoreClusterMutation : public BaseMemOpClusterMutation {
public:
StoreClusterMutation(const TargetInstrInfo *tii,
const TargetRegisterInfo *tri)
: BaseMemOpClusterMutation(tii, tri, false) {}
const TargetRegisterInfo *tri,
bool ReorderWhileClustering)
: BaseMemOpClusterMutation(tii, tri, false, ReorderWhileClustering) {}
};

class LoadClusterMutation : public BaseMemOpClusterMutation {
public:
LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
: BaseMemOpClusterMutation(tii, tri, true) {}
LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri,
bool ReorderWhileClustering)
: BaseMemOpClusterMutation(tii, tri, true, ReorderWhileClustering) {}
};

} // end anonymous namespace
Expand All @@ -1779,15 +1784,19 @@ namespace llvm {

std::unique_ptr<ScheduleDAGMutation>
createLoadClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
const TargetRegisterInfo *TRI,
bool ReorderWhileClustering) {
return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
TII, TRI, ReorderWhileClustering)
: nullptr;
}

std::unique_ptr<ScheduleDAGMutation>
createStoreClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
const TargetRegisterInfo *TRI,
bool ReorderWhileClustering) {
return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
TII, TRI, ReorderWhileClustering)
: nullptr;
}

Expand Down Expand Up @@ -1840,7 +1849,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(

SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;
if (SUa->NodeNum > SUb->NodeNum)
if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);

// FIXME: Is this check really required?
Expand Down
75 changes: 72 additions & 3 deletions llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2194,6 +2194,71 @@ MachineInstr *RISCVInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
.setMIFlags(MemI.getFlags());
}

bool RISCVInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1,
int64_t &Offset2) const {
if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
return false;

auto IsLoadOpcode = [&](unsigned Opcode) {
switch (Opcode) {
case RISCV::LB:
case RISCV::LBU:
case RISCV::LH:
case RISCV::LHU:
case RISCV::FLH:
case RISCV::LW:
case RISCV::LWU:
case RISCV::FLW:
case RISCV::LD:
case RISCV::FLD:
return true;
default:
return false;
}
};

if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
!IsLoadOpcode(Load2->getMachineOpcode()))
return false;

// Check if base address and chain operands match.
if (Load1->getOperand(0) != Load2->getOperand(0))
if (Load1->getOperand(0) != Load2->getOperand(0) ||
Load1->getOperand(2) != Load2->getOperand(2))
return false;

// Determine the offsets.
if (isa<ConstantSDNode>(Load1->getOperand(1)) &&
isa<ConstantSDNode>(Load2->getOperand(1))) {
Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue();
Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue();
return true;
}

return false;
}

bool RISCVInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const {
assert(Offset2 > Offset1);

if ((Offset2 - Offset1) / 8 > 64)
return false;

// Check if the machine opcodes are different. If they are different
// then we consider them to not be of the same base address,
if ((Load1->getMachineOpcode() != Load2->getMachineOpcode()))
return false; // FIXME: overly conservative?

// Four loads in a row should be sufficient.
if (NumLoads >= 3)
return false;

return true;
}

bool RISCVInstrInfo::getMemOperandsWithOffsetWidth(
const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
Expand Down Expand Up @@ -2282,9 +2347,13 @@ bool RISCVInstrInfo::shouldClusterMemOps(
return false;
}

// TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
// indicate they likely share a cache line.
return ClusterSize <= 4;
unsigned CacheLineSize =
BaseOps1.front()->getParent()->getMF()->getSubtarget().getCacheLineSize();
// Assume a cache line size of 64 bytes if no size is set in RISCVSubtarget.
CacheLineSize = CacheLineSize ? CacheLineSize : 64;
// Cluster if the memory operations are on the same or a neighbouring cache
// line.
return std::abs(Offset1 - Offset2) < CacheLineSize;
}

// Set BaseReg (the base register operand), Offset (the byte offset being
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,20 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const override;

bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
int64_t &Offset2) const override;

bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
int64_t Offset2, bool OffsetIsScalable2,
unsigned ClusterSize,
unsigned NumBytes) const override;

bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1,
int64_t Offset2,
unsigned NumLoads) const override;

bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
const MachineOperand *&BaseOp,
int64_t &Offset, unsigned &Width,
Expand Down
16 changes: 3 additions & 13 deletions llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,6 @@ static cl::opt<bool>
cl::desc("Enable Split RegisterAlloc for RVV"),
cl::init(true));

static cl::opt<bool> EnableMISchedLoadClustering(
"riscv-misched-load-clustering", cl::Hidden,
cl::desc("Enable load clustering in the machine scheduler"),
cl::init(false));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
Expand Down Expand Up @@ -345,15 +340,10 @@ class RISCVPassConfig : public TargetPassConfig {
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
ScheduleDAGMILive *DAG = nullptr;
if (EnableMISchedLoadClustering) {
DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
}
if (ST.hasMacroFusion()) {
DAG = DAG ? DAG : createGenericSchedLive(C);
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI, true));
if (ST.hasMacroFusion())
DAG->addMutation(createRISCVMacroFusionDAGMutation());
}
return DAG;
}

Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,15 @@ define i32 @va1(ptr %fmt, ...) {
; RV64-NEXT: sd a2, 32(sp)
; RV64-NEXT: sd a3, 40(sp)
; RV64-NEXT: sd a4, 48(sp)
; RV64-NEXT: sd a5, 56(sp)
; RV64-NEXT: addi a0, sp, 24
; RV64-NEXT: sd a0, 8(sp)
; RV64-NEXT: lw a0, 12(sp)
; RV64-NEXT: lwu a1, 8(sp)
; RV64-NEXT: lwu a0, 8(sp)
; RV64-NEXT: lw a1, 12(sp)
; RV64-NEXT: sd a5, 56(sp)
; RV64-NEXT: sd a6, 64(sp)
; RV64-NEXT: sd a7, 72(sp)
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: or a0, a0, a1
; RV64-NEXT: slli a1, a1, 32
; RV64-NEXT: or a0, a1, a0
; RV64-NEXT: addi a1, a0, 4
; RV64-NEXT: srli a2, a1, 32
; RV64-NEXT: sw a1, 8(sp)
Expand Down Expand Up @@ -968,22 +968,22 @@ define i32 @va_large_stack(ptr %fmt, ...) {
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: sd a4, 304(a0)
; RV64-NEXT: lui a0, 24414
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: sd a5, 312(a0)
; RV64-NEXT: lui a0, 24414
; RV64-NEXT: addiw a0, a0, 280
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: sd a0, 8(sp)
; RV64-NEXT: lw a0, 12(sp)
; RV64-NEXT: lwu a1, 8(sp)
; RV64-NEXT: lwu a0, 8(sp)
; RV64-NEXT: lw a1, 12(sp)
; RV64-NEXT: lui a2, 24414
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: sd a5, 312(a2)
; RV64-NEXT: lui a2, 24414
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: sd a6, 320(a2)
; RV64-NEXT: lui a2, 24414
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: sd a7, 328(a2)
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: or a0, a0, a1
; RV64-NEXT: slli a1, a1, 32
; RV64-NEXT: or a0, a1, a0
; RV64-NEXT: addi a1, a0, 4
; RV64-NEXT: srli a2, a1, 32
; RV64-NEXT: sw a1, 8(sp)
Expand Down
38 changes: 19 additions & 19 deletions llvm/test/CodeGen/RISCV/add-before-shl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -167,17 +167,17 @@ define i128 @add_wide_operand(i128 %a) nounwind {
; RV32I: # %bb.0:
; RV32I-NEXT: lw a2, 0(a1)
; RV32I-NEXT: lw a3, 4(a1)
; RV32I-NEXT: lw a4, 12(a1)
; RV32I-NEXT: lw a1, 8(a1)
; RV32I-NEXT: lw a4, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
; RV32I-NEXT: srli a5, a2, 29
; RV32I-NEXT: slli a6, a3, 3
; RV32I-NEXT: or a5, a6, a5
; RV32I-NEXT: srli a3, a3, 29
; RV32I-NEXT: slli a6, a1, 3
; RV32I-NEXT: slli a6, a4, 3
; RV32I-NEXT: or a3, a6, a3
; RV32I-NEXT: srli a1, a1, 29
; RV32I-NEXT: slli a4, a4, 3
; RV32I-NEXT: or a1, a4, a1
; RV32I-NEXT: srli a4, a4, 29
; RV32I-NEXT: slli a1, a1, 3
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: slli a2, a2, 3
; RV32I-NEXT: lui a4, 128
; RV32I-NEXT: add a1, a1, a4
Expand All @@ -200,26 +200,26 @@ define i128 @add_wide_operand(i128 %a) nounwind {
;
; RV32C-LABEL: add_wide_operand:
; RV32C: # %bb.0:
; RV32C-NEXT: lw a6, 4(a1)
; RV32C-NEXT: c.lw a3, 12(a1)
; RV32C-NEXT: c.lw a4, 0(a1)
; RV32C-NEXT: c.lw a2, 12(a1)
; RV32C-NEXT: lw a6, 0(a1)
; RV32C-NEXT: c.lw a3, 4(a1)
; RV32C-NEXT: c.lw a1, 8(a1)
; RV32C-NEXT: c.lui a5, 16
; RV32C-NEXT: c.add a3, a5
; RV32C-NEXT: c.slli a3, 3
; RV32C-NEXT: c.add a2, a5
; RV32C-NEXT: c.slli a2, 3
; RV32C-NEXT: srli a5, a1, 29
; RV32C-NEXT: c.or a3, a5
; RV32C-NEXT: srli a5, a4, 29
; RV32C-NEXT: slli a2, a6, 3
; RV32C-NEXT: c.or a2, a5
; RV32C-NEXT: srli a5, a6, 29
; RV32C-NEXT: slli a4, a3, 3
; RV32C-NEXT: c.or a4, a5
; RV32C-NEXT: c.srli a3, 29
; RV32C-NEXT: c.slli a1, 3
; RV32C-NEXT: c.or a1, a5
; RV32C-NEXT: c.slli a4, 3
; RV32C-NEXT: c.sw a4, 0(a0)
; RV32C-NEXT: c.or a1, a3
; RV32C-NEXT: c.slli a6, 3
; RV32C-NEXT: sw a6, 0(a0)
; RV32C-NEXT: c.sw a1, 8(a0)
; RV32C-NEXT: c.sw a2, 4(a0)
; RV32C-NEXT: c.sw a3, 12(a0)
; RV32C-NEXT: c.sw a4, 4(a0)
; RV32C-NEXT: c.sw a2, 12(a0)
; RV32C-NEXT: c.jr ra
;
; RV64C-LABEL: add_wide_operand:
Expand Down
Loading