-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[RISCV] Slightly improve expanded multiply emulation in getVLENFactoredAmount. #84113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-risc-v Author: Craig Topper (topperc) Changes: Instead of initializing the accumulator to 0, initialize it on first assignment with a mv from the register that holds VLENB << ShiftAmount. Fix a missing kill flag on the final Add. I have no real interest in this case, just an easy optimization I noticed. Stacked on #84110. I should probably figure out how to use spr. Full diff: https://github.com/llvm/llvm-project/pull/84113.diff 4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2abe015c9f9cdc..dc1585686b6447 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3059,11 +3059,11 @@ void RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
"Reserve the stack by the multiple of one vector size.");
MachineRegisterInfo &MRI = MF.getRegInfo();
- int64_t NumOfVReg = Amount / 8;
+ assert(isInt<32>(Amount / 8) &&
+ "Expect the number of vector registers within 32-bits.");
+ uint32_t NumOfVReg = Amount / 8;
BuildMI(MBB, II, DL, get(RISCV::PseudoReadVLENB), DestReg).setMIFlag(Flag);
- assert(isInt<32>(NumOfVReg) &&
- "Expect the number of vector registers within 32-bits.");
if (llvm::has_single_bit<uint32_t>(NumOfVReg)) {
uint32_t ShiftAmount = Log2_32(NumOfVReg);
if (ShiftAmount == 0)
@@ -3130,30 +3130,37 @@ void RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
.addReg(N, RegState::Kill)
.setMIFlag(Flag);
} else {
- Register Acc = MRI.createVirtualRegister(&RISCV::GPRRegClass);
- BuildMI(MBB, II, DL, get(RISCV::ADDI), Acc)
- .addReg(RISCV::X0)
- .addImm(0)
- .setMIFlag(Flag);
+ Register Acc;
uint32_t PrevShiftAmount = 0;
for (uint32_t ShiftAmount = 0; NumOfVReg >> ShiftAmount; ShiftAmount++) {
- if (NumOfVReg & (1LL << ShiftAmount)) {
+ if (NumOfVReg & (1U << ShiftAmount)) {
if (ShiftAmount)
BuildMI(MBB, II, DL, get(RISCV::SLLI), DestReg)
.addReg(DestReg, RegState::Kill)
.addImm(ShiftAmount - PrevShiftAmount)
.setMIFlag(Flag);
- if (NumOfVReg >> (ShiftAmount + 1))
- BuildMI(MBB, II, DL, get(RISCV::ADD), Acc)
- .addReg(Acc, RegState::Kill)
- .addReg(DestReg)
- .setMIFlag(Flag);
+ if (NumOfVReg >> (ShiftAmount + 1)) {
+          // If we don't have an accumulator yet, create it and copy DestReg.
+ if (!Acc) {
+ Acc = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, II, DL, get(RISCV::ADDI), Acc)
+ .addReg(DestReg)
+ .addImm(0)
+ .setMIFlag(Flag);
+ } else {
+ BuildMI(MBB, II, DL, get(RISCV::ADD), Acc)
+ .addReg(Acc, RegState::Kill)
+ .addReg(DestReg)
+ .setMIFlag(Flag);
+ }
+ }
PrevShiftAmount = ShiftAmount;
}
}
+ assert(Acc && "Expected valid accumulator");
BuildMI(MBB, II, DL, get(RISCV::ADD), DestReg)
.addReg(DestReg, RegState::Kill)
- .addReg(Acc)
+ .addReg(Acc, RegState::Kill)
.setMIFlag(Flag);
}
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
index 78bec6c68c3f6e..466ab085b266b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
@@ -253,9 +253,8 @@ define void @lmul4_and_2_x2_1() nounwind {
; NOMUL-NEXT: sd s0, 32(sp) # 8-byte Folded Spill
; NOMUL-NEXT: addi s0, sp, 48
; NOMUL-NEXT: csrr a0, vlenb
-; NOMUL-NEXT: li a1, 0
; NOMUL-NEXT: slli a0, a0, 2
-; NOMUL-NEXT: add a1, a1, a0
+; NOMUL-NEXT: mv a1, a0
; NOMUL-NEXT: slli a0, a0, 1
; NOMUL-NEXT: add a0, a0, a1
; NOMUL-NEXT: sub sp, sp, a0
@@ -455,9 +454,8 @@ define void @lmul_8_x5() nounwind {
; NOMUL-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; NOMUL-NEXT: addi s0, sp, 80
; NOMUL-NEXT: csrr a0, vlenb
-; NOMUL-NEXT: li a1, 0
; NOMUL-NEXT: slli a0, a0, 3
-; NOMUL-NEXT: add a1, a1, a0
+; NOMUL-NEXT: mv a1, a0
; NOMUL-NEXT: slli a0, a0, 2
; NOMUL-NEXT: add a0, a0, a1
; NOMUL-NEXT: sub sp, sp, a0
@@ -517,9 +515,8 @@ define void @lmul_8_x9() nounwind {
; NOMUL-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; NOMUL-NEXT: addi s0, sp, 80
; NOMUL-NEXT: csrr a0, vlenb
-; NOMUL-NEXT: li a1, 0
; NOMUL-NEXT: slli a0, a0, 3
-; NOMUL-NEXT: add a1, a1, a0
+; NOMUL-NEXT: mv a1, a0
; NOMUL-NEXT: slli a0, a0, 3
; NOMUL-NEXT: add a0, a0, a1
; NOMUL-NEXT: sub sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 855e280164a25c..68740eec56e4c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -2133,9 +2133,8 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 0
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
@@ -2256,9 +2255,8 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
; CHECK-NEXT: vfmin.vv v8, v11, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 0
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: mv a1, a0
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
@@ -2739,9 +2737,8 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 0
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
@@ -2852,9 +2849,8 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; CHECK-NEXT: vfmin.vv v8, v11, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 0
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: mv a1, a0
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
@@ -3461,9 +3457,8 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 0
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
@@ -3584,9 +3579,8 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
; CHECK-NEXT: vfmax.vv v8, v11, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 0
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: mv a1, a0
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
@@ -4067,9 +4061,8 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 0
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
@@ -4180,9 +4173,8 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; CHECK-NEXT: vfmax.vv v8, v11, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 0
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: mv a1, a0
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 6a7ec6dc5bd7df..e5bef20fd9e24d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -193,9 +193,8 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
; VLA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; VLA-NEXT: vmv1r.v v16, v15
; VLA-NEXT: csrr a0, vlenb
-; VLA-NEXT: li a1, 0
; VLA-NEXT: slli a0, a0, 3
-; VLA-NEXT: add a1, a1, a0
+; VLA-NEXT: mv a1, a0
; VLA-NEXT: slli a0, a0, 1
; VLA-NEXT: add a0, a0, a1
; VLA-NEXT: add a0, sp, a0
@@ -245,9 +244,8 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
; VLA-NEXT: li a0, 32
; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; VLA-NEXT: csrr a0, vlenb
-; VLA-NEXT: li a1, 0
; VLA-NEXT: slli a0, a0, 3
-; VLA-NEXT: add a1, a1, a0
+; VLA-NEXT: mv a1, a0
; VLA-NEXT: slli a0, a0, 1
; VLA-NEXT: add a0, a0, a1
; VLA-NEXT: add a0, sp, a0
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Acc = MRI.createVirtualRegister(&RISCV::GPRRegClass);
BuildMI(MBB, II, DL, get(RISCV::ADDI), Acc)
    .addReg(DestReg)
    .addImm(0)
    .setMIFlag(Flag);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could also do
Original (ADDI):
Acc = MRI.createVirtualRegister(&RISCV::GPRRegClass);
BuildMI(MBB, II, DL, get(RISCV::ADDI), Acc)
    .addReg(DestReg)
    .addImm(0)
    .setMIFlag(Flag);
Suggested (COPY):
Acc = MRI.createVirtualRegister(&RISCV::GPRRegClass);
BuildMI(MBB, II, DL, get(RISCV::COPY), Acc)
    .addReg(DestReg)
    .setMIFlag(Flag);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
[RISCV] Slightly improve expanded multiply emulation in getVLENFactoredAmount. Instead of initializing the accumulator to 0, initialize it on first assignment with a mv from the register that holds VLENB << ShiftAmount. Fix a missing kill flag on the final Add. I have no real interest in this case, just an easy optimization I noticed.
5115c7b
to
268b6e9
Compare
Instead of initializing the accumulator to 0, initialize it on first
assignment with a mv from the register that holds VLENB << ShiftAmount.
Fix a missing kill flag on the final Add.
I have no real interest in this case, just an easy optimization I noticed.
Stacked on #84110. I should probably figure out how to use spr.