[RISCV] Add a rematerializable pseudo instruction for LUI+ADDI for global addresses. #93352
Conversation
…obal addresses.

This allows register allocation to rematerialize these instead of spilling and reloading. We need to make it a single instruction due to limitations in rematerialization. This pseudo is expanded to an LUI+ADDI pair between regalloc and post-RA scheduling.

This improves the dynamic instruction count on 531.deepsjeng_r from SPEC 2017 by 3.2% for the train dataset. 500.perlbench and 502.gcc see a 1% improvement. There are a couple of regressions, but they are 0.1% or smaller.

AArch64 has similar pseudo instructions, such as MOVaddr.

This is similar to llvm#93142 but uses a one-use check in the PatFrag instead of llvm#93129.
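To make the motivation concrete, here is a minimal sketch, not taken from the patch: the symbol @table and the functions @use and @walk are invented for illustration. The address of a global formed by LUI+ADDI is live across calls, so without this pseudo the register allocator may spill and reload it; as a single rematerializable instruction it can instead be recomputed at each use.

; Illustrative only: @table, @use, and @walk are hypothetical names.
@table = external global [64 x i8]

declare void @use(i8)

define void @walk() {
  ; The LUI+ADDI pair forming @table's address is live across both calls.
  ; Selected as two separate instructions, the address may be spilled and
  ; reloaded around the calls; selected as one PseudoMovAddr, it can simply
  ; be rematerialized where it is needed.
  %p0 = getelementptr inbounds [64 x i8], ptr @table, i64 0, i64 0
  %v0 = load i8, ptr %p0
  call void @use(i8 %v0)
  %p1 = getelementptr inbounds [64 x i8], ptr @table, i64 0, i64 1
  %v1 = load i8, ptr %p1
  call void @use(i8 %v1)
  ret void
}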
@llvm/pr-subscribers-backend-risc-v

Author: Craig Topper (topperc)

Patch is 58.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93352.diff

19 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 9d574edb4e6d1..50c8b6b2a9610 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1311,6 +1311,26 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12),
/// HI and ADD_LO address nodes.
+// Pseudo for a rematerializable LUI+ADDI sequence for loading an address.
+// It will be expanded after register allocation.
+// FIXME: The scheduling information does not reflect the multiple instructions.
+let Size = 8, isReMaterializable = 1 in
+def PseudoMovAddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>,
+ Sched<[WriteIALU]>;
+
+def riscv_hi_oneuse : unop_oneuse<riscv_hi>;
+def addr_hi_lo : PatFrag<(ops node:$hi, node:$lo),
+ (riscv_add_lo (riscv_hi_oneuse node:$hi), node:$lo)>;
+
+def : Pat<(addr_hi_lo tglobaladdr:$hi, tglobaladdr:$lo),
+ (PseudoMovAddr tglobaladdr:$hi, tglobaladdr:$lo)>;
+def : Pat<(addr_hi_lo tblockaddress:$hi, tblockaddress:$lo),
+ (PseudoMovAddr tblockaddress:$hi, tblockaddress:$lo)>;
+def : Pat<(addr_hi_lo tjumptable:$hi, tjumptable:$lo),
+ (PseudoMovAddr tjumptable:$hi, tjumptable:$lo)>;
+def : Pat<(addr_hi_lo tconstpool:$hi, tconstpool:$lo),
+ (PseudoMovAddr tconstpool:$hi, tconstpool:$lo)>;
+
def : Pat<(riscv_hi tglobaladdr:$in), (LUI tglobaladdr:$in)>;
def : Pat<(riscv_hi tblockaddress:$in), (LUI tblockaddress:$in)>;
def : Pat<(riscv_hi tjumptable:$in), (LUI tjumptable:$in)>;
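A hedged reading of the selection logic above, with operand spellings illustrative rather than actual TableGen or ISel output: riscv_hi_oneuse restricts the PatFrag to the case where the %hi node has a single use, so a shared LUI is left alone and can still feed its other users.

; Sketch of the intended selection behavior; operands are illustrative.
;   (riscv_add_lo (riscv_hi tglobaladdr:@g), tglobaladdr:@g)
;     -> PseudoMovAddr %hi(@g), %lo(@g)   when the riscv_hi node has one use
;     -> separate LUI and ADDI patterns   when the riscv_hi node is shared,
;        so the LUI result remains available to its other users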
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index 410989177a8b9..fecc83a821f42 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -84,7 +84,8 @@ INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE,
// 3) The offset value in the Global Address or Constant Pool is 0.
bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi,
MachineInstr *&Lo) {
- if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC)
+ if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC &&
+ Hi.getOpcode() != RISCV::PseudoMovAddr)
return false;
const MachineOperand &HiOp1 = Hi.getOperand(1);
@@ -97,16 +98,22 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi,
HiOp1.getOffset() != 0)
return false;
- Register HiDestReg = Hi.getOperand(0).getReg();
- if (!MRI->hasOneUse(HiDestReg))
- return false;
+ if (Hi.getOpcode() == RISCV::PseudoMovAddr) {
+ // Most of the code should handle this correctly without modification by
+ // setting both Lo and Hi to point to PseudoMovAddr.
+ Lo = &Hi;
+ } else {
+ Register HiDestReg = Hi.getOperand(0).getReg();
+ if (!MRI->hasOneUse(HiDestReg))
+ return false;
- Lo = &*MRI->use_instr_begin(HiDestReg);
- if (Lo->getOpcode() != RISCV::ADDI)
- return false;
+ Lo = &*MRI->use_instr_begin(HiDestReg);
+ if (Lo->getOpcode() != RISCV::ADDI)
+ return false;
+ }
const MachineOperand &LoOp2 = Lo->getOperand(2);
- if (Hi.getOpcode() == RISCV::LUI) {
+ if (Hi.getOpcode() == RISCV::LUI || Hi.getOpcode() == RISCV::PseudoMovAddr) {
if (LoOp2.getTargetFlags() != RISCVII::MO_LO ||
!(LoOp2.isGlobal() || LoOp2.isCPI() || LoOp2.isBlockAddress()) ||
LoOp2.getOffset() != 0)
@@ -466,6 +473,13 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
Hi.getOperand(1).setOffset(NewOffset);
MachineOperand &ImmOp = Lo.getOperand(2);
+ // Expand PseudoMovAddr into LUI
+ if (Hi.getOpcode() == RISCV::PseudoMovAddr) {
+ auto *TII = ST->getInstrInfo();
+ Hi.setDesc(TII->get(RISCV::LUI));
+ Hi.removeOperand(2);
+ }
+
if (Hi.getOpcode() != RISCV::AUIPC)
ImmOp.setOffset(NewOffset);
@@ -501,6 +515,11 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
}
}
+ // Prevent Lo (originally PseudoMovAddr, which Hi also points to) from
+ // being erased.
+ if (&Lo == &Hi)
+ return true;
+
MRI->replaceRegWith(Lo.getOperand(0).getReg(), Hi.getOperand(0).getReg());
Lo.eraseFromParent();
return true;
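For orientation, a hedged sketch of the fold this pass performs; the symbol @g and the offset are invented, not from the patch. A constant offset on the memory operand is folded into the %hi/%lo relocations, and with PseudoMovAddr the Hi and Lo handles refer to the same instruction, which the code above then demotes to a plain LUI.

; Illustrative only: @g and the offset are hypothetical.
; Before RISCVMergeBaseOffset:            After folding:
;   a0 = PseudoMovAddr %hi(@g), %lo(@g)     lui a0, %hi(@g+8)
;   lw  a0, 8(a0)                           lw  a0, %lo(@g+8)(a0)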
diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
index 52f2ce27164d6..b8c22f737ca4e 100644
--- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
@@ -44,6 +44,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass {
bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool expandLIaddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
};
char RISCVPostRAExpandPseudo::ID = 0;
@@ -75,6 +76,8 @@ bool RISCVPostRAExpandPseudo::expandMI(MachineBasicBlock &MBB,
switch (MBBI->getOpcode()) {
case RISCV::PseudoMovImm:
return expandMovImm(MBB, MBBI);
+ case RISCV::PseudoMovAddr:
+ return expandLIaddr(MBB, MBBI);
default:
return false;
}
@@ -101,6 +104,26 @@ bool RISCVPostRAExpandPseudo::expandMovImm(MachineBasicBlock &MBB,
return true;
}
+bool RISCVPostRAExpandPseudo::expandLIaddr(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ Register DstReg = MBBI->getOperand(0).getReg();
+ bool DstIsDead = MBBI->getOperand(0).isDead();
+ bool Renamable = MBBI->getOperand(0).isRenamable();
+
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::LUI))
+ .addReg(DstReg, RegState::Define | getRenamableRegState(Renamable))
+ .add(MBBI->getOperand(1));
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead) |
+ getRenamableRegState(Renamable))
+ .addReg(DstReg, RegState::Kill | getRenamableRegState(Renamable))
+ .add(MBBI->getOperand(2));
+ MBBI->eraseFromParent();
+ return true;
+}
+
} // end of anonymous namespace
INITIALIZE_PASS(RISCVPostRAExpandPseudo, "riscv-expand-pseudolisimm32",
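A hedged sketch of the expansion expandLIaddr performs after register allocation; the MIR below is spelled loosely and the register name is illustrative.

; Loosely-spelled MIR, before expansion:
;   renamable $x10 = PseudoMovAddr target-flags(riscv-hi) @sym, target-flags(riscv-lo) @sym
; after expansion into the LUI+ADDI pair:
;   renamable $x10 = LUI target-flags(riscv-hi) @sym
;   renamable $x10 = ADDI killed renamable $x10, target-flags(riscv-lo) @sym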
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 549d531e829ea..a90c244437a03 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -383,8 +383,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT: lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0)
; RV32I-NEXT: neg a0, s2
; RV32I-NEXT: and a0, s2, a0
; RV32I-NEXT: mv a1, s3
@@ -442,9 +442,9 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32M-LABEL: test_cttz_i64:
; RV32M: # %bb.0:
; RV32M-NEXT: lui a2, 30667
-; RV32M-NEXT: addi a2, a2, 1329
-; RV32M-NEXT: lui a3, %hi(.LCPI3_0)
-; RV32M-NEXT: addi a3, a3, %lo(.LCPI3_0)
+; RV32M-NEXT: addi a3, a2, 1329
+; RV32M-NEXT: lui a2, %hi(.LCPI3_0)
+; RV32M-NEXT: addi a2, a2, %lo(.LCPI3_0)
; RV32M-NEXT: bnez a1, .LBB3_3
; RV32M-NEXT: # %bb.1:
; RV32M-NEXT: li a1, 32
@@ -452,18 +452,18 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32M-NEXT: .LBB3_2:
; RV32M-NEXT: neg a1, a0
; RV32M-NEXT: and a0, a0, a1
-; RV32M-NEXT: mul a0, a0, a2
+; RV32M-NEXT: mul a0, a0, a3
; RV32M-NEXT: srli a0, a0, 27
-; RV32M-NEXT: add a0, a3, a0
+; RV32M-NEXT: add a0, a2, a0
; RV32M-NEXT: lbu a0, 0(a0)
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
; RV32M-NEXT: .LBB3_3:
; RV32M-NEXT: neg a4, a1
; RV32M-NEXT: and a1, a1, a4
-; RV32M-NEXT: mul a1, a1, a2
+; RV32M-NEXT: mul a1, a1, a3
; RV32M-NEXT: srli a1, a1, 27
-; RV32M-NEXT: add a1, a3, a1
+; RV32M-NEXT: add a1, a2, a1
; RV32M-NEXT: lbu a1, 0(a1)
; RV32M-NEXT: bnez a0, .LBB3_2
; RV32M-NEXT: .LBB3_4:
@@ -814,8 +814,8 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui a0, %hi(.LCPI7_0)
-; RV32I-NEXT: addi s4, a0, %lo(.LCPI7_0)
+; RV32I-NEXT: lui s4, %hi(.LCPI7_0)
+; RV32I-NEXT: addi s4, s4, %lo(.LCPI7_0)
; RV32I-NEXT: neg a0, s1
; RV32I-NEXT: and a0, s1, a0
; RV32I-NEXT: mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index 9ae30e646fdbf..fe6e20d852d59 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -48,8 +48,8 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind {
; RV32I-NEXT: mv a1, s1
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui a0, %hi(.LCPI0_0)
-; RV32I-NEXT: addi s3, a0, %lo(.LCPI0_0)
+; RV32I-NEXT: lui s3, %hi(.LCPI0_0)
+; RV32I-NEXT: addi s3, s3, %lo(.LCPI0_0)
; RV32I-NEXT: neg a0, s4
; RV32I-NEXT: and a0, s4, a0
; RV32I-NEXT: mv a1, s1
@@ -511,8 +511,8 @@ define signext i32 @ctz4(i64 %b) nounwind {
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui a0, %hi(.LCPI6_0)
-; RV32I-NEXT: addi s4, a0, %lo(.LCPI6_0)
+; RV32I-NEXT: lui s4, %hi(.LCPI6_0)
+; RV32I-NEXT: addi s4, s4, %lo(.LCPI6_0)
; RV32I-NEXT: neg a0, s2
; RV32I-NEXT: and a0, s2, a0
; RV32I-NEXT: mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index eb6ac985287a1..478d2eae9dca2 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -24,31 +24,31 @@ define void @_Z3foov() {
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49)
; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma
-; CHECK-NEXT: vle16.v v10, (a0)
+; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48)
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v10, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46)
-; CHECK-NEXT: vle16.v v12, (a0)
+; CHECK-NEXT: vle16.v v10, (a0)
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45)
-; CHECK-NEXT: vle16.v v14, (a0)
+; CHECK-NEXT: vle16.v v12, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vs2r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_40)
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index 3c2e84689c979..62b1549a5d58a 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -389,8 +389,8 @@ define dso_local i32 @load_ga() local_unnamed_addr #0 {
define dso_local i64 @load_ga_8() nounwind {
; RV32I-LABEL: load_ga_8:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: lui a0, %hi(ga_8)
-; RV32I-NEXT: addi a1, a0, %lo(ga_8)
+; RV32I-NEXT: lui a1, %hi(ga_8)
+; RV32I-NEXT: addi a1, a1, %lo(ga_8)
; RV32I-NEXT: lw a0, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
; RV32I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index b45ab135fa1c7..197366e7e05fe 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -209,8 +209,8 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT: lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0)
; RV32I-NEXT: neg a0, s2
; RV32I-NEXT: and a0, s2, a0
; RV32I-NEXT: mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 7e6c3f9c87d27..f25aa0de89da8 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -199,8 +199,8 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: mv a1, s3
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT: addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT: lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0)
; RV32I-NEXT: neg a0, s2
; RV32I-NEXT: and a0, s2, a0
; RV32I-NEXT: mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 9cb3991f31f94..08b310213d16e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -126,28 +126,28 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vid.v v8
; CHECK-NEXT: vsaddu.vx v8, v8, a1
-; CHECK-NEXT: vmsltu.vx v0, v8, a2
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vmsltu.vx v0, v8, a2
; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v24, v16
+; CHECK-NEXT: vsaddu.vx v16, v24, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
; CHECK-NEXT: vsext.vf8 v16, v8
; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v8, v16, a2
-; CHECK-NEXT: vsext.vf8 v16, v9
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
; CHECK-NEXT: lui a0, %hi(.LCPI9_2)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2)
-; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vmsltu.vx v10, v16, a2
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v8, 2
+; CHECK-NEXT: vslideup.vi v0, v9, 2
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v0, v10, 4
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsext.vf8 v16, v8
; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
@@ -169,13 +169,13 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsext.vf8 v16, v8
; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v10, v16, a2
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
; CHECK-NEXT: vsext.vf8 v16, v9
; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v8, v16, a2
; CHECK-NEXT: lui a0, %hi(.LCPI10_2)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2)
; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vmsltu.vx v10, v16, a2
; CHECK-NEXT: lui a0, %hi(.LCPI10_3)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3)
; CHECK-NEXT: vle8.v v11, (a0)
@@ -187,10 +187,10 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
; CHECK-NEXT: vmsltu.vx v11, v16, a2
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: lui a0, %hi(.LCPI10_4)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4)
; CHECK-NEXT: vle8.v v12, (a0)
+; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: lui a0, %hi(.LCPI10_5)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5)
; CHECK-NEXT: vle8.v v13, (a0)
@@ -201,27 +201,27 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
; CHECK-NEXT: vsaddu.vx v16, v16, a1
; CHECK-NEXT: vmsltu.vx v13, v16, a2
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 2
+; CHECK-NEXT: vslideup.vi v10, v8, 2
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 4
+; CHECK-NEXT: vslideup.vi v10, v9, 4
; CHECK-NEXT: lui a0, %hi(.LCPI10_6)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6)
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v11, 6
+; CHECK-NEXT: vslideup.vi v10, v11, 6
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v0, v12, 2
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v0, v13, 4
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsext.vf8 v16, v8
; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vi v0, v9, 6
+; CHECK-NEXT: vslideup.vi v0, v8, 6
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v0, v8, 8
+; CHECK-NEXT: vslideup.vi v0, v10, 8
; CHECK-NEXT: ret
%mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 79c36a629465d..f4d7074c7f6b2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -3459,6 +3459,8 @@ define void @mulhu_v4i64(ptr %x) {
; RV64-NEXT: lui a1, %hi(.LCPI184_0)
; RV64-NEXT: addi a1, a1, %lo(.LCPI184_0)
; RV64-NEXT: vle64.v v10, (a1)
+; RV64-NEXT: vmulhu.vv v10, v8, v10
+; RV64-NEXT: vsub.vv v8, v8, v10
; RV64-NEXT: li a1, -1
; RV64-NEXT: slli a1, a1, 63
; RV64-NEXT: vmv.s.x v12, a1
@@ -3466,8 +3468,6 @@ define void @mulhu_v4i64(ptr %x) {
; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma
; RV64-NEXT: vslideup.vi v14, v12, 2
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vmulhu.vv v10, v8, v10
-; RV64-NEXT: vsub.vv v8, v8, v10
; RV64-NEXT: vmulhu.vv v8, v8, v14
; RV64-NEXT: vadd.vv v8, v8, v10
; RV64-NEXT: lui a1, 12320
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 178a920169ad9..bc3e135a588a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -159,17 +159,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 82
+; RV32-NEXT: li a3, 80
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 82 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb
; RV32-NEXT: addi a3, a1, 256
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vle32.v v16, (a3)
; RV32-NEXT: csr...
[truncated]
LGTM w/minor comment.
Note - I see this as mostly a stepping stone and a way to make forward progress. If we want to keep discussing removing the one-use check in favor of #93142, I'm happy to keep doing so.
@@ -44,6 +44,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass {
  bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                MachineBasicBlock::iterator &NextMBBI);
  bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
  bool expandLIaddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
expandMovAddr just for consistency.
✅ With the latest revision this PR passed the C/C++ code formatter.