-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[RISCV] Add a rematerializable pseudo instruction for LUI+ADDI for global addresses. #93142
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Don't fold if there are loads/stores that use the ADD_LO with a non-zero immediate offset that can't be folded based on alignment. This avoids cases where some loads/stores use the LUI directly and other loads/stores use the result of an ADDI that depends on the LUI. This increases the latency to the load that we no longer fold, but reduces the need for a temporary register to hold the LUI result for multiple uses. This is preparation for introducing a rematerializable LUI+ADDI pseudoinstruction. Co-authored-by: Jesse Huang <[email protected]>
…obal addresses. This allows register allocation to rematerialize these instead of spilling and reloading. We need to make it a single instruction due to limitations in rematerialization. This pseudo is expanded to an LUI+ADDI pair between regalloc and post RA scheduling. Co-authored-by: Jesse Huang <[email protected]>
@llvm/pr-subscribers-backend-risc-v Author: Craig Topper (topperc) Changes: This allows register allocation to rematerialize these instead of spilling and reloading. We need to make it a single instruction due to limitations in rematerialization. This pseudo is expanded to an LUI+ADDI pair between regalloc and post RA scheduling. Co-authored-by: Jesse Huang <[email protected]> Stacked on #93129 Patch is 647.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93142.diff 39 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index d965dd4fc9a95..7c0908f18e28b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2457,6 +2457,61 @@ static bool isWorthFoldingAdd(SDValue Add) {
return true;
}
+// To prevent SelectAddrRegImm from folding offsets that conflicts with the
+// fusion of PseudoLIAddr, check if the offset of every use of a given address
+// is within the alignment
+static bool areUserOffsetsWithinAlignment(SDValue Addr, Align Alignment) {
+ for (auto *Use : Addr->uses()) {
+ if (!Use->isMachineOpcode()) {
+ // Don't allow stores of the value. It must be used as the address.
+ if (Use->getOpcode() == ISD::STORE &&
+ cast<StoreSDNode>(Use)->getValue() == Addr)
+ return false;
+ if (Use->getOpcode() == ISD::ATOMIC_STORE &&
+ cast<AtomicSDNode>(Use)->getVal() == Addr)
+ return false;
+ // If the user is direct load/store, there is no offset.
+ if (Use->getOpcode() == ISD::LOAD || Use->getOpcode() == ISD::STORE ||
+ Use->getOpcode() == ISD::ATOMIC_LOAD ||
+ Use->getOpcode() == ISD::ATOMIC_STORE)
+ continue;
+ if (Use->getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Alignment > cast<ConstantSDNode>(Use->getOperand(1))->getSExtValue())
+ continue;
+
+ return false;
+ }
+
+ // If user is already selected, get offsets from load/store instructions
+ unsigned int Opcode = Use->getMachineOpcode();
+ if (Opcode == RISCV::LB || Opcode == RISCV::LBU || Opcode == RISCV::LH ||
+ Opcode == RISCV::LHU || Opcode == RISCV::LW || Opcode == RISCV::LWU ||
+ Opcode == RISCV::LD || Opcode == RISCV::FLH || Opcode == RISCV::FLW ||
+ Opcode == RISCV::FLD) {
+ if (auto *Offset = dyn_cast<ConstantSDNode>(Use->getOperand(1))) {
+ if (Offset->isZero() || Alignment > Offset->getSExtValue())
+ continue;
+ }
+ return false;
+ }
+ if (Opcode == RISCV::SB || Opcode == RISCV::SH || Opcode == RISCV::SW ||
+ Opcode == RISCV::SD || Opcode == RISCV::FSH || Opcode == RISCV::FSW ||
+ Opcode == RISCV::FSD) {
+ // Also check if Addr is used as the value of store.
+ if (Use->getOperand(0) == Addr)
+ return false;
+ if (auto *Offset = dyn_cast<ConstantSDNode>(Use->getOperand(2))) {
+ if (Offset->isZero() || Alignment > Offset->getSExtValue())
+ continue;
+ }
+ return false;
+ }
+ return false;
+ }
+
+ return true;
+}
bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr,
unsigned MaxShiftAmount,
SDValue &Base, SDValue &Index,
@@ -2520,9 +2575,21 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
MVT VT = Addr.getSimpleValueType();
if (Addr.getOpcode() == RISCVISD::ADD_LO) {
- Base = Addr.getOperand(0);
- Offset = Addr.getOperand(1);
- return true;
+ bool CanFold = true;
+ // Unconditionally fold if operand 1 is not a global address (e.g.
+ // externsymbol)
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Addr.getOperand(1))) {
+ const DataLayout &DL = CurDAG->getDataLayout();
+ Align Alignment = commonAlignment(
+ GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
+ if (!areUserOffsetsWithinAlignment(Addr, Alignment))
+ CanFold = false;
+ }
+ if (CanFold) {
+ Base = Addr.getOperand(0);
+ Offset = Addr.getOperand(1);
+ return true;
+ }
}
int64_t RV32ZdinxRange = IsINX ? 4 : 0;
@@ -2541,7 +2608,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
const DataLayout &DL = CurDAG->getDataLayout();
Align Alignment = commonAlignment(
GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
- if (CVal == 0 || Alignment > CVal) {
+ if (areUserOffsetsWithinAlignment(Base, Alignment)) {
int64_t CombinedOffset = CVal + GA->getOffset();
Base = Base.getOperand(0);
Offset = CurDAG->getTargetGlobalAddress(
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 9d574edb4e6d1..8903ddc1903af 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1311,6 +1311,22 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12),
/// HI and ADD_LO address nodes.
+let Size = 8, isReMaterializable = 1 in
+def PseudoLIaddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>,
+ Sched<[WriteIALU]>;
+
+def LUIADDI : PatFrag<(ops node:$hi, node:$lo),
+ (riscv_add_lo (riscv_hi node:$hi), node:$lo)>;
+
+def : Pat<(LUIADDI tglobaladdr:$hi, tglobaladdr:$lo),
+ (PseudoLIaddr tglobaladdr:$hi, tglobaladdr:$lo)>;
+def : Pat<(LUIADDI tblockaddress:$hi, tblockaddress:$lo),
+ (PseudoLIaddr tblockaddress:$hi, tblockaddress:$lo)>;
+def : Pat<(LUIADDI tjumptable:$hi, tjumptable:$lo),
+ (PseudoLIaddr tjumptable:$hi, tjumptable:$lo)>;
+def : Pat<(LUIADDI tconstpool:$hi, tconstpool:$lo),
+ (PseudoLIaddr tconstpool:$hi, tconstpool:$lo)>;
+
def : Pat<(riscv_hi tglobaladdr:$in), (LUI tglobaladdr:$in)>;
def : Pat<(riscv_hi tblockaddress:$in), (LUI tblockaddress:$in)>;
def : Pat<(riscv_hi tjumptable:$in), (LUI tjumptable:$in)>;
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index 410989177a8b9..1b8ad38682b55 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -84,7 +84,8 @@ INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE,
// 3) The offset value in the Global Address or Constant Pool is 0.
bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi,
MachineInstr *&Lo) {
- if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC)
+ if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC &&
+ Hi.getOpcode() != RISCV::PseudoLIaddr)
return false;
const MachineOperand &HiOp1 = Hi.getOperand(1);
@@ -97,16 +98,22 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi,
HiOp1.getOffset() != 0)
return false;
- Register HiDestReg = Hi.getOperand(0).getReg();
- if (!MRI->hasOneUse(HiDestReg))
- return false;
+ if (Hi.getOpcode() == RISCV::PseudoLIaddr) {
+ // Most of the code should handle it correctly without modification by
+ // setting Lo and Hi both point to PseudoLIaddr
+ Lo = &Hi;
+ } else {
+ Register HiDestReg = Hi.getOperand(0).getReg();
+ if (!MRI->hasOneUse(HiDestReg))
+ return false;
- Lo = &*MRI->use_instr_begin(HiDestReg);
- if (Lo->getOpcode() != RISCV::ADDI)
- return false;
+ Lo = &*MRI->use_instr_begin(HiDestReg);
+ if (Lo->getOpcode() != RISCV::ADDI)
+ return false;
+ }
const MachineOperand &LoOp2 = Lo->getOperand(2);
- if (Hi.getOpcode() == RISCV::LUI) {
+ if (Hi.getOpcode() == RISCV::LUI || Hi.getOpcode() == RISCV::PseudoLIaddr) {
if (LoOp2.getTargetFlags() != RISCVII::MO_LO ||
!(LoOp2.isGlobal() || LoOp2.isCPI() || LoOp2.isBlockAddress()) ||
LoOp2.getOffset() != 0)
@@ -466,6 +473,13 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
Hi.getOperand(1).setOffset(NewOffset);
MachineOperand &ImmOp = Lo.getOperand(2);
+ // Expand PseudoLIaddr into LUI
+ if (Hi.getOpcode() == RISCV::PseudoLIaddr) {
+ auto *TII = ST->getInstrInfo();
+ Hi.setDesc(TII->get(RISCV::LUI));
+ Hi.removeOperand(2);
+ }
+
if (Hi.getOpcode() != RISCV::AUIPC)
ImmOp.setOffset(NewOffset);
@@ -501,6 +515,11 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
}
}
+ // Prevent Lo (originally PseudoLIaddr, which is also pointed by Hi) from
+ // being erased
+ if (&Lo == &Hi)
+ return true;
+
MRI->replaceRegWith(Lo.getOperand(0).getReg(), Hi.getOperand(0).getReg());
Lo.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
index 52f2ce27164d6..ce82fbea10063 100644
--- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
@@ -44,6 +44,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass {
bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool expandLIaddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
};
char RISCVPostRAExpandPseudo::ID = 0;
@@ -75,6 +76,8 @@ bool RISCVPostRAExpandPseudo::expandMI(MachineBasicBlock &MBB,
switch (MBBI->getOpcode()) {
case RISCV::PseudoMovImm:
return expandMovImm(MBB, MBBI);
+ case RISCV::PseudoLIaddr:
+ return expandLIaddr(MBB, MBBI);
default:
return false;
}
@@ -101,6 +104,26 @@ bool RISCVPostRAExpandPseudo::expandMovImm(MachineBasicBlock &MBB,
return true;
}
+bool RISCVPostRAExpandPseudo::expandLIaddr(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ Register DstReg = MBBI->getOperand(0).getReg();
+ bool DstIsDead = MBBI->getOperand(0).isDead();
+ bool Renamable = MBBI->getOperand(0).isRenamable();
+
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::LUI))
+ .addReg(DstReg, RegState::Define | getRenamableRegState(Renamable))
+ .add(MBBI->getOperand(1));
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead) |
+ getRenamableRegState(Renamable))
+ .addReg(DstReg, RegState::Kill | getRenamableRegState(Renamable))
+ .add(MBBI->getOperand(2));
+ MBBI->eraseFromParent();
+ return true;
+}
+
} // end of anonymous namespace
INITIALIZE_PASS(RISCVPostRAExpandPseudo, "riscv-expand-pseudolisimm32",
diff --git a/llvm/test/CodeGen/RISCV/bfloat-mem.ll b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
index 4b6c0c29d660b..39340c85cfadc 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-mem.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
@@ -53,11 +53,11 @@ define bfloat @flh_fsh_global(bfloat %a, bfloat %b) nounwind {
; CHECK-NEXT: fadd.s fa5, fa4, fa5
; CHECK-NEXT: fcvt.bf16.s fa0, fa5
; CHECK-NEXT: lui a0, %hi(G)
-; CHECK-NEXT: flh fa5, %lo(G)(a0)
-; CHECK-NEXT: addi a1, a0, %lo(G)
-; CHECK-NEXT: fsh fa0, %lo(G)(a0)
-; CHECK-NEXT: flh fa5, 18(a1)
-; CHECK-NEXT: fsh fa0, 18(a1)
+; CHECK-NEXT: addi a0, a0, %lo(G)
+; CHECK-NEXT: flh fa5, 0(a0)
+; CHECK-NEXT: fsh fa0, 0(a0)
+; CHECK-NEXT: flh fa5, 18(a0)
+; CHECK-NEXT: fsh fa0, 18(a0)
; CHECK-NEXT: ret
%1 = fadd bfloat %a, %b
%2 = load volatile bfloat, ptr @G
diff --git a/llvm/test/CodeGen/RISCV/byval.ll b/llvm/test/CodeGen/RISCV/byval.ll
index 9151f3b03e7c2..c5e48ee75e482 100644
--- a/llvm/test/CodeGen/RISCV/byval.ll
+++ b/llvm/test/CodeGen/RISCV/byval.ll
@@ -22,15 +22,15 @@ define void @caller() nounwind {
; RV32I-NEXT: addi sp, sp, -32
; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
; RV32I-NEXT: lui a0, %hi(foo)
-; RV32I-NEXT: lw a1, %lo(foo)(a0)
-; RV32I-NEXT: sw a1, 12(sp)
; RV32I-NEXT: addi a0, a0, %lo(foo)
; RV32I-NEXT: lw a1, 12(a0)
; RV32I-NEXT: sw a1, 24(sp)
; RV32I-NEXT: lw a1, 8(a0)
; RV32I-NEXT: sw a1, 20(sp)
-; RV32I-NEXT: lw a0, 4(a0)
-; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: lw a1, 4(a0)
+; RV32I-NEXT: sw a1, 16(sp)
+; RV32I-NEXT: lw a0, 0(a0)
+; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: addi a0, sp, 12
; RV32I-NEXT: call callee
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
index 2122b3fd91788..036daf587eda0 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
@@ -28,281 +28,281 @@ define void @callee() nounwind {
; ILP32-LABEL: callee:
; ILP32: # %bb.0:
; ILP32-NEXT: lui a0, %hi(var)
-; ILP32-NEXT: flw fa5, %lo(var)(a0)
-; ILP32-NEXT: flw fa4, %lo(var+4)(a0)
-; ILP32-NEXT: flw fa3, %lo(var+8)(a0)
-; ILP32-NEXT: flw fa2, %lo(var+12)(a0)
-; ILP32-NEXT: addi a1, a0, %lo(var)
-; ILP32-NEXT: flw fa1, 16(a1)
-; ILP32-NEXT: flw fa0, 20(a1)
-; ILP32-NEXT: flw ft0, 24(a1)
-; ILP32-NEXT: flw ft1, 28(a1)
-; ILP32-NEXT: flw ft2, 32(a1)
-; ILP32-NEXT: flw ft3, 36(a1)
-; ILP32-NEXT: flw ft4, 40(a1)
-; ILP32-NEXT: flw ft5, 44(a1)
-; ILP32-NEXT: flw ft6, 48(a1)
-; ILP32-NEXT: flw ft7, 52(a1)
-; ILP32-NEXT: flw fa6, 56(a1)
-; ILP32-NEXT: flw fa7, 60(a1)
-; ILP32-NEXT: flw ft8, 64(a1)
-; ILP32-NEXT: flw ft9, 68(a1)
-; ILP32-NEXT: flw ft10, 72(a1)
-; ILP32-NEXT: flw ft11, 76(a1)
-; ILP32-NEXT: flw fs0, 80(a1)
-; ILP32-NEXT: flw fs1, 84(a1)
-; ILP32-NEXT: flw fs2, 88(a1)
-; ILP32-NEXT: flw fs3, 92(a1)
-; ILP32-NEXT: flw fs4, 96(a1)
-; ILP32-NEXT: flw fs5, 100(a1)
-; ILP32-NEXT: flw fs6, 104(a1)
-; ILP32-NEXT: flw fs7, 108(a1)
-; ILP32-NEXT: flw fs8, 124(a1)
-; ILP32-NEXT: flw fs9, 120(a1)
-; ILP32-NEXT: flw fs10, 116(a1)
-; ILP32-NEXT: flw fs11, 112(a1)
-; ILP32-NEXT: fsw fs8, 124(a1)
-; ILP32-NEXT: fsw fs9, 120(a1)
-; ILP32-NEXT: fsw fs10, 116(a1)
-; ILP32-NEXT: fsw fs11, 112(a1)
-; ILP32-NEXT: fsw fs7, 108(a1)
-; ILP32-NEXT: fsw fs6, 104(a1)
-; ILP32-NEXT: fsw fs5, 100(a1)
-; ILP32-NEXT: fsw fs4, 96(a1)
-; ILP32-NEXT: fsw fs3, 92(a1)
-; ILP32-NEXT: fsw fs2, 88(a1)
-; ILP32-NEXT: fsw fs1, 84(a1)
-; ILP32-NEXT: fsw fs0, 80(a1)
-; ILP32-NEXT: fsw ft11, 76(a1)
-; ILP32-NEXT: fsw ft10, 72(a1)
-; ILP32-NEXT: fsw ft9, 68(a1)
-; ILP32-NEXT: fsw ft8, 64(a1)
-; ILP32-NEXT: fsw fa7, 60(a1)
-; ILP32-NEXT: fsw fa6, 56(a1)
-; ILP32-NEXT: fsw ft7, 52(a1)
-; ILP32-NEXT: fsw ft6, 48(a1)
-; ILP32-NEXT: fsw ft5, 44(a1)
-; ILP32-NEXT: fsw ft4, 40(a1)
-; ILP32-NEXT: fsw ft3, 36(a1)
-; ILP32-NEXT: fsw ft2, 32(a1)
-; ILP32-NEXT: fsw ft1, 28(a1)
-; ILP32-NEXT: fsw ft0, 24(a1)
-; ILP32-NEXT: fsw fa0, 20(a1)
-; ILP32-NEXT: fsw fa1, 16(a1)
-; ILP32-NEXT: fsw fa2, %lo(var+12)(a0)
-; ILP32-NEXT: fsw fa3, %lo(var+8)(a0)
-; ILP32-NEXT: fsw fa4, %lo(var+4)(a0)
-; ILP32-NEXT: fsw fa5, %lo(var)(a0)
+; ILP32-NEXT: addi a0, a0, %lo(var)
+; ILP32-NEXT: flw fa5, 0(a0)
+; ILP32-NEXT: flw fa4, 4(a0)
+; ILP32-NEXT: flw fa3, 8(a0)
+; ILP32-NEXT: flw fa2, 12(a0)
+; ILP32-NEXT: flw fa1, 16(a0)
+; ILP32-NEXT: flw fa0, 20(a0)
+; ILP32-NEXT: flw ft0, 24(a0)
+; ILP32-NEXT: flw ft1, 28(a0)
+; ILP32-NEXT: flw ft2, 32(a0)
+; ILP32-NEXT: flw ft3, 36(a0)
+; ILP32-NEXT: flw ft4, 40(a0)
+; ILP32-NEXT: flw ft5, 44(a0)
+; ILP32-NEXT: flw ft6, 48(a0)
+; ILP32-NEXT: flw ft7, 52(a0)
+; ILP32-NEXT: flw fa6, 56(a0)
+; ILP32-NEXT: flw fa7, 60(a0)
+; ILP32-NEXT: flw ft8, 64(a0)
+; ILP32-NEXT: flw ft9, 68(a0)
+; ILP32-NEXT: flw ft10, 72(a0)
+; ILP32-NEXT: flw ft11, 76(a0)
+; ILP32-NEXT: flw fs0, 80(a0)
+; ILP32-NEXT: flw fs1, 84(a0)
+; ILP32-NEXT: flw fs2, 88(a0)
+; ILP32-NEXT: flw fs3, 92(a0)
+; ILP32-NEXT: flw fs4, 96(a0)
+; ILP32-NEXT: flw fs5, 100(a0)
+; ILP32-NEXT: flw fs6, 104(a0)
+; ILP32-NEXT: flw fs7, 108(a0)
+; ILP32-NEXT: flw fs8, 124(a0)
+; ILP32-NEXT: flw fs9, 120(a0)
+; ILP32-NEXT: flw fs10, 116(a0)
+; ILP32-NEXT: flw fs11, 112(a0)
+; ILP32-NEXT: fsw fs8, 124(a0)
+; ILP32-NEXT: fsw fs9, 120(a0)
+; ILP32-NEXT: fsw fs10, 116(a0)
+; ILP32-NEXT: fsw fs11, 112(a0)
+; ILP32-NEXT: fsw fs7, 108(a0)
+; ILP32-NEXT: fsw fs6, 104(a0)
+; ILP32-NEXT: fsw fs5, 100(a0)
+; ILP32-NEXT: fsw fs4, 96(a0)
+; ILP32-NEXT: fsw fs3, 92(a0)
+; ILP32-NEXT: fsw fs2, 88(a0)
+; ILP32-NEXT: fsw fs1, 84(a0)
+; ILP32-NEXT: fsw fs0, 80(a0)
+; ILP32-NEXT: fsw ft11, 76(a0)
+; ILP32-NEXT: fsw ft10, 72(a0)
+; ILP32-NEXT: fsw ft9, 68(a0)
+; ILP32-NEXT: fsw ft8, 64(a0)
+; ILP32-NEXT: fsw fa7, 60(a0)
+; ILP32-NEXT: fsw fa6, 56(a0)
+; ILP32-NEXT: fsw ft7, 52(a0)
+; ILP32-NEXT: fsw ft6, 48(a0)
+; ILP32-NEXT: fsw ft5, 44(a0)
+; ILP32-NEXT: fsw ft4, 40(a0)
+; ILP32-NEXT: fsw ft3, 36(a0)
+; ILP32-NEXT: fsw ft2, 32(a0)
+; ILP32-NEXT: fsw ft1, 28(a0)
+; ILP32-NEXT: fsw ft0, 24(a0)
+; ILP32-NEXT: fsw fa0, 20(a0)
+; ILP32-NEXT: fsw fa1, 16(a0)
+; ILP32-NEXT: fsw fa2, 12(a0)
+; ILP32-NEXT: fsw fa3, 8(a0)
+; ILP32-NEXT: fsw fa4, 4(a0)
+; ILP32-NEXT: fsw fa5, 0(a0)
; ILP32-NEXT: ret
;
; ILP32E-LABEL: callee:
; ILP32E: # %bb.0:
; ILP32E-NEXT: lui a0, %hi(var)
-; ILP32E-NEXT: flw fa5, %lo(var)(a0)
-; ILP32E-NEXT: flw fa4, %lo(var+4)(a0)
-; ILP32E-NEXT: flw fa3, %lo(var+8)(a0)
-; ILP32E-NEXT: flw fa2, %lo(var+12)(a0)
-; ILP32E-NEXT: addi a1, a0, %lo(var)
-; ILP32E-NEXT: flw fa1, 16(a1)
-; ILP32E-NEXT: flw fa0, 20(a1)
-; ILP32E-NEXT: flw ft0, 24(a1)
-; ILP32E-NEXT: flw ft1, 28(a1)
-; ILP32E-NEXT: flw ft2, 32(a1)
-; ILP32E-NEXT: flw ft3, 36(a1)
-; ILP32E-NEXT: flw ft4, 40(a1)
-; ILP32E-NEXT: flw ft5, 44(a1)
-; ILP32E-NEXT: flw ft6, 48(a1)
-; ILP32E-NEXT: flw ft7, 52(a1)
-; ILP32E-NEXT: flw fa6, 56(a1)
-; ILP32E-NEXT: flw fa7, 60(a1)
-; ILP32E-NEXT: flw ft8, 64(a1)
-; ILP32E-NEXT: flw ft9, 68(a1)
-; ILP32E-NEXT: flw ft10, 72(a1)
-; ILP32E-NEXT: flw ft11, 76(a1)
-; ILP32E-NEXT: flw fs0, 80(a1)
-; ILP32E-NEXT: flw fs1, 84(a1)
-; ILP32E-NEXT: flw fs2, 88(a1)
-; ILP32E-NEXT: flw fs3, 92(a1)
-; ILP32E-NEXT: flw fs4, 96(a1)
-; ILP32E-NEXT: flw fs5, 100(a1)
-; ILP32E-NEXT: flw fs6, 104(a1)
-; ILP32E-NEXT: flw fs7, 108(a1)
-; ILP32E-NEXT: flw fs8, 124(a1)
-; ILP32E-NEXT: flw fs9, 120(a1)
-; ILP32E-NEXT: flw fs10, 116(a1)
-; ILP32E-NEXT: flw fs11, 112(a1)
-; ILP32E-NEXT: fsw fs8, 124(a1)
-; ILP32E-NEXT: fsw fs9, 120(a1)
-; ILP32E-NEXT: fsw fs10, 116(a1)
-; ILP32E-NEXT: fsw fs11, 112(a1)
-; ILP32E-NEXT: fsw fs7, 108(a1)
-; ILP32E-NEXT: fsw fs6, 104(a1)
-; ILP32E-NEXT: fsw fs5, 100(a1)
-; ILP32E-NEXT: fsw fs4, 96(a1)
-; ILP32E-NEXT: fsw fs3, 92(a1)
-; ILP32E-NEXT: fsw fs2, 88(a1)
-; ILP32E-NEXT: fsw fs1, 84(a1)
-; ILP32E-NEXT: fsw fs0, 80(a1)
-; ILP32E-NEXT: fsw ft11, 76(a1)
-; ILP32E-NEXT: fsw ft10, 72(a1)
-; ILP32E-NEXT: fsw ft9, 68(a1)
-; ILP32E-NEXT: fsw ft8, 64(a1)
-; ILP32E-NEXT: fsw fa7, 60(a1)
-; ILP32E-NEXT: fsw fa6, 56(a1)
-; ILP32E-NEXT: fsw ft7, 52(a1)
-; ILP32E-NEXT: fsw ft6, 48(a1)
-; ILP32E-NEXT: fsw ft5, 44(a1)
-; ILP32E-NEXT: fsw ft4, 40(a1)
-; ILP32E-NEXT: fsw ft3, 36(a1)
-; ILP32E-NEXT: fsw ft2, 32(a1)
-; ILP32E-NEXT: fsw ft1, 28(a1)
-; ILP32E-NEXT: fsw ft0, 24(a1)
-; ILP32E-NEXT: fsw fa0, 20(a1)
-; ILP32E-NEXT: fsw fa1, 16(a1)
-; ILP32E-NEXT: fsw fa2, %lo(var+12)(a0)
-; ILP32E-NEXT: fsw fa3, %lo(var+8)(a0)
-; ILP32E-NEXT: fsw fa4, %lo(var+4)(a0)
-; ILP32E-NEXT: fsw fa5, %lo(var)(a0)
+; ILP32E-NEXT: addi a0, a0, %lo(var)
+; ILP32E-NEXT: flw fa5, 0(a0)
+; ILP32E-NEXT: flw fa4, 4(a0)
+; ILP32E-NEXT: flw fa3, 8(a0)
+; ILP32E-NEXT: flw fa2, 12(a0)
+; ILP32E-NEXT: flw fa1, 16(a0)
+; ILP32E-NEXT: flw fa0, 20(a0)
+; ILP32E-NEXT: flw ft0, 24(a0)
+; ILP32E-NEXT: flw ft1, 28(a0)
+; ILP32E-NEXT: flw ft2, 32(a0)
+; ILP32E-NEXT: flw ft3, 36(a0)
+; ILP32E-NEXT: flw ft4, 40(a0)
+; ILP32E-NEXT: flw ft5, 44(a0)
+; ILP32E-NEXT: flw ft6, 48(a0)
+; ILP32E-NEXT: flw ft7, 52(a0)
+; ILP32E-NEXT: flw fa6, 56(a0)
+; ILP32E-NEXT: flw fa7, 60(a0)
+; ILP32E-NEXT: flw ft8, 64(a0)
+; ILP32E-NEXT: flw ft9, 68(a0)
+; ILP32E-NEXT: flw ft10, 72(a0)
+; ILP32E-NEXT: flw ft11, 76(a0)
+; ILP32E-NEXT: flw fs0, 80(a0)
+; ILP32E-NEXT: flw fs1, 84(a0)
+; ILP32E-NEXT: flw fs2, 88(a0)
+; ILP32E-NEXT: flw fs3, 92(a0)
+; ILP32E-NEXT: flw fs4, 96(a0)
+; ILP32E-NEXT: flw fs5, 100(a0)
+; ILP32E-NEXT: flw fs6, 1...
[truncated]
|
@dtcxzyw can you do any measurements on this? |
@@ -1311,6 +1311,22 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12), | |||
|
|||
/// HI and ADD_LO address nodes. | |||
|
|||
let Size = 8, isReMaterializable = 1 in | |||
def PseudoLIaddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about PseudoLIAddr or PseudoLoadAddr?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe PseudoLUIADDI? I agree that LIaddr isn't super clear.
You should also add a comment above the pseudo which describes its purpose.
@@ -1311,6 +1311,22 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12), | |||
|
|||
/// HI and ADD_LO address nodes. | |||
|
|||
let Size = 8, isReMaterializable = 1 in | |||
def PseudoLIaddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>, | |||
Sched<[WriteIALU]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't we need SchedReads here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are no register operands so they wouldn't be associated with anything.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know if we should model the second instruction ADDI here.
I am sorry I cannot help. It is weird to me that checksums of llvm-test-suite binaries don't change after this patch :( |
That is weird to me too. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
General idea makes sense, and I see nothing concerning in the diffs.
@@ -1311,6 +1311,22 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12), | |||
|
|||
/// HI and ADD_LO address nodes. | |||
|
|||
let Size = 8, isReMaterializable = 1 in | |||
def PseudoLIaddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe PseudoLUIADDI? I agree that LIaddr isn't super clear.
You should also add a comment above the pseudo which describes its purpose.
…obal addresses. This allows register allocation to rematerialize these instead of spilling and reloading. We need to make it a single instruction due to limitations in rematerialization. This pseudo is expanded to an LUI+ADDI pair between regalloc and post RA scheduling. This improves the dynamic instruction count on 531.deepsjeng_r from spec2017 by 3.2% for the train dataset. 500.perlbench and 502.gcc see a 1% improvement. There are a couple of regressions, but they are 0.1% or smaller. AArch64 has similar pseudo instructions like MOVaddr. This is similar to llvm#93142 but uses a one-use check in the PatFrag instead of llvm#93129.
This allows register allocation to rematerialize these instead of spilling and reloading. We need to make it a single instruction due to limitations in rematerialization.
This pseudo is expanded to an LUI+ADDI pair between regalloc and post RA scheduling.
This improves the dynamic instruction count on 531.deepsjeng_r from spec2017 by 3.2% for the train dataset. 500.perlbench and 502.gcc see a 1% improvement. There are a couple of regressions, but they are 0.1% or smaller.
AArch64 has similar pseudo instructions like MOVaddr.
Co-authored-by: Jesse Huang [email protected]
Stacked on #93129