Skip to content

[RISCV] Add register allocation hints for lui/auipc+addi fusion. #123860

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,26 @@ bool RISCVRegisterInfo::getRegAllocationHints(
tryAddHint(MO, MI.getOperand(0), NeedGPRC);
}
}

// Add a hint if it would allow auipc/lui+addi(w) fusion.
if ((MI.getOpcode() == RISCV::ADDIW || MI.getOpcode() == RISCV::ADDI) &&
MI.getOperand(1).isReg()) {
const MachineBasicBlock &MBB = *MI.getParent();
MachineBasicBlock::const_iterator I = MI.getIterator();
// Is the previous instruction a LUI or AUIPC that can be fused?
if (I != MBB.begin()) {
I = skipDebugInstructionsBackward(std::prev(I), MBB.begin());
if (((I->getOpcode() == RISCV::LUI && Subtarget.hasLUIADDIFusion()) ||
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder whether these two fusions are general/common enough that we should just enable this by default. Even code that was not specifically compiled with such a fusion in mind has a high chance of running on a machine with these fusions. Though I guess this is really asking whether this should be in the generic tuning, which is a bit orthogonal to this patch, isn't it?

(I->getOpcode() == RISCV::AUIPC &&
Subtarget.hasAUIPCADDIFusion())) &&
I->getOperand(0).getReg() == MI.getOperand(1).getReg()) {
if (OpIdx == 0)
tryAddHint(MO, MI.getOperand(1), /*NeedGPRC=*/false);
else
tryAddHint(MO, MI.getOperand(0), /*NeedGPRC=*/false);
}
}
}
}

for (MCPhysReg OrderReg : Order)
Expand Down
166 changes: 112 additions & 54 deletions llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -relocation-model=pic -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefixes=RV32I %s
; RUN: | FileCheck -check-prefixes=RV32I,RV32NOFUSION %s
; RUN: llc -mtriple=riscv64 -relocation-model=pic -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefixes=RV64I %s
; RUN: | FileCheck -check-prefixes=RV64I,RV64NOFUSION %s
; RUN: llc -mtriple=riscv32 -relocation-model=pic -verify-machineinstrs < %s \
; RUN: -mattr=+auipc-addi-fusion | FileCheck -check-prefixes=RV32I,RV32FUSION %s
; RUN: llc -mtriple=riscv64 -relocation-model=pic -verify-machineinstrs < %s \
; RUN: -mattr=+auipc-addi-fusion | FileCheck -check-prefixes=RV64I,RV64FUSION %s

; Verifies that MachineLICM can hoist address generation pseudos out of loops.

Expand Down Expand Up @@ -141,59 +145,113 @@ ret:
@gd = external thread_local global i32

define void @test_la_tls_gd(i32 signext %n) nounwind {
; RV32I-LABEL: test_la_tls_gd:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: addi sp, sp, -16
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
; RV32I-NEXT: li s2, 0
; RV32I-NEXT: .Lpcrel_hi3:
; RV32I-NEXT: auipc a0, %tls_gd_pcrel_hi(gd)
; RV32I-NEXT: addi s1, a0, %pcrel_lo(.Lpcrel_hi3)
; RV32I-NEXT: .LBB3_1: # %loop
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: call __tls_get_addr
; RV32I-NEXT: lw zero, 0(a0)
; RV32I-NEXT: addi s2, s2, 1
; RV32I-NEXT: blt s2, s0, .LBB3_1
; RV32I-NEXT: # %bb.2: # %ret
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
; RV32NOFUSION-LABEL: test_la_tls_gd:
; RV32NOFUSION: # %bb.0: # %entry
; RV32NOFUSION-NEXT: addi sp, sp, -16
; RV32NOFUSION-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32NOFUSION-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32NOFUSION-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
; RV32NOFUSION-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32NOFUSION-NEXT: mv s0, a0
; RV32NOFUSION-NEXT: li s2, 0
; RV32NOFUSION-NEXT: .Lpcrel_hi3:
; RV32NOFUSION-NEXT: auipc a0, %tls_gd_pcrel_hi(gd)
; RV32NOFUSION-NEXT: addi s1, a0, %pcrel_lo(.Lpcrel_hi3)
; RV32NOFUSION-NEXT: .LBB3_1: # %loop
; RV32NOFUSION-NEXT: # =>This Inner Loop Header: Depth=1
; RV32NOFUSION-NEXT: mv a0, s1
; RV32NOFUSION-NEXT: call __tls_get_addr
; RV32NOFUSION-NEXT: lw zero, 0(a0)
; RV32NOFUSION-NEXT: addi s2, s2, 1
; RV32NOFUSION-NEXT: blt s2, s0, .LBB3_1
; RV32NOFUSION-NEXT: # %bb.2: # %ret
; RV32NOFUSION-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32NOFUSION-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32NOFUSION-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32NOFUSION-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32NOFUSION-NEXT: addi sp, sp, 16
; RV32NOFUSION-NEXT: ret
;
; RV64I-LABEL: test_la_tls_gd:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: addi sp, sp, -32
; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: li s2, 0
; RV64I-NEXT: .Lpcrel_hi3:
; RV64I-NEXT: auipc a0, %tls_gd_pcrel_hi(gd)
; RV64I-NEXT: addi s1, a0, %pcrel_lo(.Lpcrel_hi3)
; RV64I-NEXT: .LBB3_1: # %loop
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: call __tls_get_addr
; RV64I-NEXT: lw zero, 0(a0)
; RV64I-NEXT: addiw s2, s2, 1
; RV64I-NEXT: blt s2, s0, .LBB3_1
; RV64I-NEXT: # %bb.2: # %ret
; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
; RV64I-NEXT: addi sp, sp, 32
; RV64I-NEXT: ret
; RV64NOFUSION-LABEL: test_la_tls_gd:
; RV64NOFUSION: # %bb.0: # %entry
; RV64NOFUSION-NEXT: addi sp, sp, -32
; RV64NOFUSION-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64NOFUSION-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; RV64NOFUSION-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64NOFUSION-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64NOFUSION-NEXT: mv s0, a0
; RV64NOFUSION-NEXT: li s2, 0
; RV64NOFUSION-NEXT: .Lpcrel_hi3:
; RV64NOFUSION-NEXT: auipc a0, %tls_gd_pcrel_hi(gd)
; RV64NOFUSION-NEXT: addi s1, a0, %pcrel_lo(.Lpcrel_hi3)
; RV64NOFUSION-NEXT: .LBB3_1: # %loop
; RV64NOFUSION-NEXT: # =>This Inner Loop Header: Depth=1
; RV64NOFUSION-NEXT: mv a0, s1
; RV64NOFUSION-NEXT: call __tls_get_addr
; RV64NOFUSION-NEXT: lw zero, 0(a0)
; RV64NOFUSION-NEXT: addiw s2, s2, 1
; RV64NOFUSION-NEXT: blt s2, s0, .LBB3_1
; RV64NOFUSION-NEXT: # %bb.2: # %ret
; RV64NOFUSION-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64NOFUSION-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64NOFUSION-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; RV64NOFUSION-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
; RV64NOFUSION-NEXT: addi sp, sp, 32
; RV64NOFUSION-NEXT: ret
;
; RV32FUSION-LABEL: test_la_tls_gd:
; RV32FUSION: # %bb.0: # %entry
; RV32FUSION-NEXT: addi sp, sp, -16
; RV32FUSION-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32FUSION-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32FUSION-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
; RV32FUSION-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32FUSION-NEXT: mv s0, a0
; RV32FUSION-NEXT: li s2, 0
; RV32FUSION-NEXT: .Lpcrel_hi3:
; RV32FUSION-NEXT: auipc s1, %tls_gd_pcrel_hi(gd)
; RV32FUSION-NEXT: addi s1, s1, %pcrel_lo(.Lpcrel_hi3)
; RV32FUSION-NEXT: .LBB3_1: # %loop
; RV32FUSION-NEXT: # =>This Inner Loop Header: Depth=1
; RV32FUSION-NEXT: mv a0, s1
; RV32FUSION-NEXT: call __tls_get_addr
; RV32FUSION-NEXT: lw zero, 0(a0)
; RV32FUSION-NEXT: addi s2, s2, 1
; RV32FUSION-NEXT: blt s2, s0, .LBB3_1
; RV32FUSION-NEXT: # %bb.2: # %ret
; RV32FUSION-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32FUSION-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32FUSION-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32FUSION-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32FUSION-NEXT: addi sp, sp, 16
; RV32FUSION-NEXT: ret
;
; RV64FUSION-LABEL: test_la_tls_gd:
; RV64FUSION: # %bb.0: # %entry
; RV64FUSION-NEXT: addi sp, sp, -32
; RV64FUSION-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64FUSION-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; RV64FUSION-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64FUSION-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64FUSION-NEXT: mv s0, a0
; RV64FUSION-NEXT: li s2, 0
; RV64FUSION-NEXT: .Lpcrel_hi3:
; RV64FUSION-NEXT: auipc s1, %tls_gd_pcrel_hi(gd)
; RV64FUSION-NEXT: addi s1, s1, %pcrel_lo(.Lpcrel_hi3)
; RV64FUSION-NEXT: .LBB3_1: # %loop
; RV64FUSION-NEXT: # =>This Inner Loop Header: Depth=1
; RV64FUSION-NEXT: mv a0, s1
; RV64FUSION-NEXT: call __tls_get_addr
; RV64FUSION-NEXT: lw zero, 0(a0)
; RV64FUSION-NEXT: addiw s2, s2, 1
; RV64FUSION-NEXT: blt s2, s0, .LBB3_1
; RV64FUSION-NEXT: # %bb.2: # %ret
; RV64FUSION-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64FUSION-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64FUSION-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
; RV64FUSION-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
; RV64FUSION-NEXT: addi sp, sp, 32
; RV64FUSION-NEXT: ret
entry:
br label %loop

Expand Down
48 changes: 48 additions & 0 deletions llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
;RUN: llc < %s -mtriple=riscv64 -mattr=+f,+lui-addi-fusion,+use-postra-scheduler -mcpu=sifive-u74 \
;RUN: -misched-postra-direction=bidirectional -target-abi=lp64f \
;RUN: | FileCheck %s --check-prefixes=FUSION-POSTRA,FUSION-POSTRA-BIDIRECTIONAL
;RUN: llc < %s -mtriple=riscv64 -mattr=+f,+lui-addi-fusion -target-abi=lp64f \
;RUN: | FileCheck %s --check-prefix=FUSION-GENERIC

@.str = private constant [4 x i8] c"%f\0A\00", align 1

Expand Down Expand Up @@ -50,6 +52,13 @@ define void @foo(i32 signext %0, i32 signext %1) {
; FUSION-POSTRA-BIDIRECTIONAL-NEXT: addi a0, a0, %lo(.L.str)
; FUSION-POSTRA-BIDIRECTIONAL-NEXT: fcvt.s.w fa0, a1
; FUSION-POSTRA-BIDIRECTIONAL-NEXT: tail bar
;
; FUSION-GENERIC-LABEL: foo:
; FUSION-GENERIC: # %bb.0:
; FUSION-GENERIC-NEXT: fcvt.s.w fa0, a1
; FUSION-GENERIC-NEXT: lui a0, %hi(.L.str)
; FUSION-GENERIC-NEXT: addi a0, a0, %lo(.L.str)
; FUSION-GENERIC-NEXT: tail bar
%3 = sitofp i32 %1 to float
tail call void @bar(ptr @.str, float %3)
ret void
Expand All @@ -76,5 +85,44 @@ define i32 @test_matint() {
; FUSION-POSTRA-NEXT: lui a0, 1
; FUSION-POSTRA-NEXT: addiw a0, a0, -2048
; FUSION-POSTRA-NEXT: ret
;
; FUSION-GENERIC-LABEL: test_matint:
; FUSION-GENERIC: # %bb.0:
; FUSION-GENERIC-NEXT: lui a0, 1
; FUSION-GENERIC-NEXT: addiw a0, a0, -2048
; FUSION-GENERIC-NEXT: ret
ret i32 2048
}

; Calls @bar with %1 and the constant 12345678 (3014 << 12 plus 334), which the
; checks below show being built with a lui+addiw pair. In the FUSION-GENERIC
; output the pair reads and writes a single register (a2) and is copied into a1
; afterwards; in the other configurations it is built directly in a1.
define void @test_regalloc_hint(i32 noundef signext %0, i32 noundef signext %1) {
; NOFUSION-LABEL: test_regalloc_hint:
; NOFUSION: # %bb.0:
; NOFUSION-NEXT: mv a0, a1
; NOFUSION-NEXT: lui a1, 3014
; NOFUSION-NEXT: addiw a1, a1, 334
; NOFUSION-NEXT: tail bar
;
; FUSION-LABEL: test_regalloc_hint:
; FUSION: # %bb.0:
; FUSION-NEXT: mv a0, a1
; FUSION-NEXT: lui a1, 3014
; FUSION-NEXT: addiw a1, a1, 334
; FUSION-NEXT: tail bar
;
; FUSION-POSTRA-LABEL: test_regalloc_hint:
; FUSION-POSTRA: # %bb.0:
; FUSION-POSTRA-NEXT: mv a0, a1
; FUSION-POSTRA-NEXT: lui a1, 3014
; FUSION-POSTRA-NEXT: addiw a1, a1, 334
; FUSION-POSTRA-NEXT: tail bar
;
; FUSION-GENERIC-LABEL: test_regalloc_hint:
; FUSION-GENERIC: # %bb.0:
; FUSION-GENERIC-NEXT: lui a2, 3014
; FUSION-GENERIC-NEXT: addiw a2, a2, 334
; FUSION-GENERIC-NEXT: mv a0, a1
; FUSION-GENERIC-NEXT: mv a1, a2
; FUSION-GENERIC-NEXT: tail bar
tail call void @bar(i32 noundef signext %1, i32 noundef signext 12345678)
ret void
}
Loading