Skip to content

Commit aa34a6a

Browse files
authored
[RISCV] Add register allocation hints for lui/auipc+addi fusion. (#123860)
Spotted the auipc case while looking at code for P550. I'm not sure this is the right long-term fix. We're still missing rematerialization opportunities for these pairs, so a pseudo might be better. That would interfere with folding auipc+addi into load/store addressing though. Fixes #76779.
1 parent 539b2e0 commit aa34a6a

File tree

3 files changed

+180
-54
lines changed

3 files changed

+180
-54
lines changed

llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -926,6 +926,26 @@ bool RISCVRegisterInfo::getRegAllocationHints(
926926
tryAddHint(MO, MI.getOperand(0), NeedGPRC);
927927
}
928928
}
929+
930+
// Add a hint if it would allow auipc/lui+addi(w) fusion.
931+
if ((MI.getOpcode() == RISCV::ADDIW || MI.getOpcode() == RISCV::ADDI) &&
932+
MI.getOperand(1).isReg()) {
933+
const MachineBasicBlock &MBB = *MI.getParent();
934+
MachineBasicBlock::const_iterator I = MI.getIterator();
935+
// Is the previous instruction a LUI or AUIPC that can be fused?
936+
if (I != MBB.begin()) {
937+
I = skipDebugInstructionsBackward(std::prev(I), MBB.begin());
938+
if (((I->getOpcode() == RISCV::LUI && Subtarget.hasLUIADDIFusion()) ||
939+
(I->getOpcode() == RISCV::AUIPC &&
940+
Subtarget.hasAUIPCADDIFusion())) &&
941+
I->getOperand(0).getReg() == MI.getOperand(1).getReg()) {
942+
if (OpIdx == 0)
943+
tryAddHint(MO, MI.getOperand(1), /*NeedGPRC=*/false);
944+
else
945+
tryAddHint(MO, MI.getOperand(0), /*NeedGPRC=*/false);
946+
}
947+
}
948+
}
929949
}
930950

931951
for (MCPhysReg OrderReg : Order)

llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll

Lines changed: 112 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=riscv32 -relocation-model=pic -verify-machineinstrs < %s \
3-
; RUN: | FileCheck -check-prefixes=RV32I %s
3+
; RUN: | FileCheck -check-prefixes=RV32I,RV32NOFUSION %s
44
; RUN: llc -mtriple=riscv64 -relocation-model=pic -verify-machineinstrs < %s \
5-
; RUN: | FileCheck -check-prefixes=RV64I %s
5+
; RUN: | FileCheck -check-prefixes=RV64I,RV64NOFUSION %s
6+
; RUN: llc -mtriple=riscv32 -relocation-model=pic -verify-machineinstrs < %s \
7+
; RUN: -mattr=+auipc-addi-fusion | FileCheck -check-prefixes=RV32I,RV32FUSION %s
8+
; RUN: llc -mtriple=riscv64 -relocation-model=pic -verify-machineinstrs < %s \
9+
; RUN: -mattr=+auipc-addi-fusion | FileCheck -check-prefixes=RV64I,RV64FUSION %s
610

711
; Verifies that MachineLICM can hoist address generation pseudos out of loops.
812

@@ -141,59 +145,113 @@ ret:
141145
@gd = external thread_local global i32
142146

143147
define void @test_la_tls_gd(i32 signext %n) nounwind {
144-
; RV32I-LABEL: test_la_tls_gd:
145-
; RV32I: # %bb.0: # %entry
146-
; RV32I-NEXT: addi sp, sp, -16
147-
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
148-
; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
149-
; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
150-
; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
151-
; RV32I-NEXT: mv s0, a0
152-
; RV32I-NEXT: li s2, 0
153-
; RV32I-NEXT: .Lpcrel_hi3:
154-
; RV32I-NEXT: auipc a0, %tls_gd_pcrel_hi(gd)
155-
; RV32I-NEXT: addi s1, a0, %pcrel_lo(.Lpcrel_hi3)
156-
; RV32I-NEXT: .LBB3_1: # %loop
157-
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
158-
; RV32I-NEXT: mv a0, s1
159-
; RV32I-NEXT: call __tls_get_addr
160-
; RV32I-NEXT: lw zero, 0(a0)
161-
; RV32I-NEXT: addi s2, s2, 1
162-
; RV32I-NEXT: blt s2, s0, .LBB3_1
163-
; RV32I-NEXT: # %bb.2: # %ret
164-
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
165-
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
166-
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
167-
; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
168-
; RV32I-NEXT: addi sp, sp, 16
169-
; RV32I-NEXT: ret
148+
; RV32NOFUSION-LABEL: test_la_tls_gd:
149+
; RV32NOFUSION: # %bb.0: # %entry
150+
; RV32NOFUSION-NEXT: addi sp, sp, -16
151+
; RV32NOFUSION-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
152+
; RV32NOFUSION-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
153+
; RV32NOFUSION-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
154+
; RV32NOFUSION-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
155+
; RV32NOFUSION-NEXT: mv s0, a0
156+
; RV32NOFUSION-NEXT: li s2, 0
157+
; RV32NOFUSION-NEXT: .Lpcrel_hi3:
158+
; RV32NOFUSION-NEXT: auipc a0, %tls_gd_pcrel_hi(gd)
159+
; RV32NOFUSION-NEXT: addi s1, a0, %pcrel_lo(.Lpcrel_hi3)
160+
; RV32NOFUSION-NEXT: .LBB3_1: # %loop
161+
; RV32NOFUSION-NEXT: # =>This Inner Loop Header: Depth=1
162+
; RV32NOFUSION-NEXT: mv a0, s1
163+
; RV32NOFUSION-NEXT: call __tls_get_addr
164+
; RV32NOFUSION-NEXT: lw zero, 0(a0)
165+
; RV32NOFUSION-NEXT: addi s2, s2, 1
166+
; RV32NOFUSION-NEXT: blt s2, s0, .LBB3_1
167+
; RV32NOFUSION-NEXT: # %bb.2: # %ret
168+
; RV32NOFUSION-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
169+
; RV32NOFUSION-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
170+
; RV32NOFUSION-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
171+
; RV32NOFUSION-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
172+
; RV32NOFUSION-NEXT: addi sp, sp, 16
173+
; RV32NOFUSION-NEXT: ret
170174
;
171-
; RV64I-LABEL: test_la_tls_gd:
172-
; RV64I: # %bb.0: # %entry
173-
; RV64I-NEXT: addi sp, sp, -32
174-
; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
175-
; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
176-
; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
177-
; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
178-
; RV64I-NEXT: mv s0, a0
179-
; RV64I-NEXT: li s2, 0
180-
; RV64I-NEXT: .Lpcrel_hi3:
181-
; RV64I-NEXT: auipc a0, %tls_gd_pcrel_hi(gd)
182-
; RV64I-NEXT: addi s1, a0, %pcrel_lo(.Lpcrel_hi3)
183-
; RV64I-NEXT: .LBB3_1: # %loop
184-
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
185-
; RV64I-NEXT: mv a0, s1
186-
; RV64I-NEXT: call __tls_get_addr
187-
; RV64I-NEXT: lw zero, 0(a0)
188-
; RV64I-NEXT: addiw s2, s2, 1
189-
; RV64I-NEXT: blt s2, s0, .LBB3_1
190-
; RV64I-NEXT: # %bb.2: # %ret
191-
; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
192-
; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
193-
; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
194-
; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
195-
; RV64I-NEXT: addi sp, sp, 32
196-
; RV64I-NEXT: ret
175+
; RV64NOFUSION-LABEL: test_la_tls_gd:
176+
; RV64NOFUSION: # %bb.0: # %entry
177+
; RV64NOFUSION-NEXT: addi sp, sp, -32
178+
; RV64NOFUSION-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
179+
; RV64NOFUSION-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
180+
; RV64NOFUSION-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
181+
; RV64NOFUSION-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
182+
; RV64NOFUSION-NEXT: mv s0, a0
183+
; RV64NOFUSION-NEXT: li s2, 0
184+
; RV64NOFUSION-NEXT: .Lpcrel_hi3:
185+
; RV64NOFUSION-NEXT: auipc a0, %tls_gd_pcrel_hi(gd)
186+
; RV64NOFUSION-NEXT: addi s1, a0, %pcrel_lo(.Lpcrel_hi3)
187+
; RV64NOFUSION-NEXT: .LBB3_1: # %loop
188+
; RV64NOFUSION-NEXT: # =>This Inner Loop Header: Depth=1
189+
; RV64NOFUSION-NEXT: mv a0, s1
190+
; RV64NOFUSION-NEXT: call __tls_get_addr
191+
; RV64NOFUSION-NEXT: lw zero, 0(a0)
192+
; RV64NOFUSION-NEXT: addiw s2, s2, 1
193+
; RV64NOFUSION-NEXT: blt s2, s0, .LBB3_1
194+
; RV64NOFUSION-NEXT: # %bb.2: # %ret
195+
; RV64NOFUSION-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
196+
; RV64NOFUSION-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
197+
; RV64NOFUSION-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
198+
; RV64NOFUSION-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
199+
; RV64NOFUSION-NEXT: addi sp, sp, 32
200+
; RV64NOFUSION-NEXT: ret
201+
;
202+
; RV32FUSION-LABEL: test_la_tls_gd:
203+
; RV32FUSION: # %bb.0: # %entry
204+
; RV32FUSION-NEXT: addi sp, sp, -16
205+
; RV32FUSION-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
206+
; RV32FUSION-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
207+
; RV32FUSION-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
208+
; RV32FUSION-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
209+
; RV32FUSION-NEXT: mv s0, a0
210+
; RV32FUSION-NEXT: li s2, 0
211+
; RV32FUSION-NEXT: .Lpcrel_hi3:
212+
; RV32FUSION-NEXT: auipc s1, %tls_gd_pcrel_hi(gd)
213+
; RV32FUSION-NEXT: addi s1, s1, %pcrel_lo(.Lpcrel_hi3)
214+
; RV32FUSION-NEXT: .LBB3_1: # %loop
215+
; RV32FUSION-NEXT: # =>This Inner Loop Header: Depth=1
216+
; RV32FUSION-NEXT: mv a0, s1
217+
; RV32FUSION-NEXT: call __tls_get_addr
218+
; RV32FUSION-NEXT: lw zero, 0(a0)
219+
; RV32FUSION-NEXT: addi s2, s2, 1
220+
; RV32FUSION-NEXT: blt s2, s0, .LBB3_1
221+
; RV32FUSION-NEXT: # %bb.2: # %ret
222+
; RV32FUSION-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
223+
; RV32FUSION-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
224+
; RV32FUSION-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
225+
; RV32FUSION-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
226+
; RV32FUSION-NEXT: addi sp, sp, 16
227+
; RV32FUSION-NEXT: ret
228+
;
229+
; RV64FUSION-LABEL: test_la_tls_gd:
230+
; RV64FUSION: # %bb.0: # %entry
231+
; RV64FUSION-NEXT: addi sp, sp, -32
232+
; RV64FUSION-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
233+
; RV64FUSION-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
234+
; RV64FUSION-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
235+
; RV64FUSION-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
236+
; RV64FUSION-NEXT: mv s0, a0
237+
; RV64FUSION-NEXT: li s2, 0
238+
; RV64FUSION-NEXT: .Lpcrel_hi3:
239+
; RV64FUSION-NEXT: auipc s1, %tls_gd_pcrel_hi(gd)
240+
; RV64FUSION-NEXT: addi s1, s1, %pcrel_lo(.Lpcrel_hi3)
241+
; RV64FUSION-NEXT: .LBB3_1: # %loop
242+
; RV64FUSION-NEXT: # =>This Inner Loop Header: Depth=1
243+
; RV64FUSION-NEXT: mv a0, s1
244+
; RV64FUSION-NEXT: call __tls_get_addr
245+
; RV64FUSION-NEXT: lw zero, 0(a0)
246+
; RV64FUSION-NEXT: addiw s2, s2, 1
247+
; RV64FUSION-NEXT: blt s2, s0, .LBB3_1
248+
; RV64FUSION-NEXT: # %bb.2: # %ret
249+
; RV64FUSION-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
250+
; RV64FUSION-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
251+
; RV64FUSION-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
252+
; RV64FUSION-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
253+
; RV64FUSION-NEXT: addi sp, sp, 32
254+
; RV64FUSION-NEXT: ret
197255
entry:
198256
br label %loop
199257

llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
;RUN: llc < %s -mtriple=riscv64 -mattr=+f,+lui-addi-fusion,+use-postra-scheduler -mcpu=sifive-u74 \
1313
;RUN: -misched-postra-direction=bidirectional -target-abi=lp64f \
1414
;RUN: | FileCheck %s --check-prefixes=FUSION-POSTRA,FUSION-POSTRA-BIDIRECTIONAL
15+
;RUN: llc < %s -mtriple=riscv64 -mattr=+f,+lui-addi-fusion -target-abi=lp64f \
16+
;RUN: | FileCheck %s --check-prefix=FUSION-GENERIC
1517

1618
@.str = private constant [4 x i8] c"%f\0A\00", align 1
1719

@@ -50,6 +52,13 @@ define void @foo(i32 signext %0, i32 signext %1) {
5052
; FUSION-POSTRA-BIDIRECTIONAL-NEXT: addi a0, a0, %lo(.L.str)
5153
; FUSION-POSTRA-BIDIRECTIONAL-NEXT: fcvt.s.w fa0, a1
5254
; FUSION-POSTRA-BIDIRECTIONAL-NEXT: tail bar
55+
;
56+
; FUSION-GENERIC-LABEL: foo:
57+
; FUSION-GENERIC: # %bb.0:
58+
; FUSION-GENERIC-NEXT: fcvt.s.w fa0, a1
59+
; FUSION-GENERIC-NEXT: lui a0, %hi(.L.str)
60+
; FUSION-GENERIC-NEXT: addi a0, a0, %lo(.L.str)
61+
; FUSION-GENERIC-NEXT: tail bar
5362
%3 = sitofp i32 %1 to float
5463
tail call void @bar(ptr @.str, float %3)
5564
ret void
@@ -76,5 +85,44 @@ define i32 @test_matint() {
7685
; FUSION-POSTRA-NEXT: lui a0, 1
7786
; FUSION-POSTRA-NEXT: addiw a0, a0, -2048
7887
; FUSION-POSTRA-NEXT: ret
88+
;
89+
; FUSION-GENERIC-LABEL: test_matint:
90+
; FUSION-GENERIC: # %bb.0:
91+
; FUSION-GENERIC-NEXT: lui a0, 1
92+
; FUSION-GENERIC-NEXT: addiw a0, a0, -2048
93+
; FUSION-GENERIC-NEXT: ret
7994
ret i32 2048
8095
}
96+
97+
define void @test_regalloc_hint(i32 noundef signext %0, i32 noundef signext %1) {
98+
; NOFUSION-LABEL: test_regalloc_hint:
99+
; NOFUSION: # %bb.0:
100+
; NOFUSION-NEXT: mv a0, a1
101+
; NOFUSION-NEXT: lui a1, 3014
102+
; NOFUSION-NEXT: addiw a1, a1, 334
103+
; NOFUSION-NEXT: tail bar
104+
;
105+
; FUSION-LABEL: test_regalloc_hint:
106+
; FUSION: # %bb.0:
107+
; FUSION-NEXT: mv a0, a1
108+
; FUSION-NEXT: lui a1, 3014
109+
; FUSION-NEXT: addiw a1, a1, 334
110+
; FUSION-NEXT: tail bar
111+
;
112+
; FUSION-POSTRA-LABEL: test_regalloc_hint:
113+
; FUSION-POSTRA: # %bb.0:
114+
; FUSION-POSTRA-NEXT: mv a0, a1
115+
; FUSION-POSTRA-NEXT: lui a1, 3014
116+
; FUSION-POSTRA-NEXT: addiw a1, a1, 334
117+
; FUSION-POSTRA-NEXT: tail bar
118+
;
119+
; FUSION-GENERIC-LABEL: test_regalloc_hint:
120+
; FUSION-GENERIC: # %bb.0:
121+
; FUSION-GENERIC-NEXT: lui a2, 3014
122+
; FUSION-GENERIC-NEXT: addiw a2, a2, 334
123+
; FUSION-GENERIC-NEXT: mv a0, a1
124+
; FUSION-GENERIC-NEXT: mv a1, a2
125+
; FUSION-GENERIC-NEXT: tail bar
126+
tail call void @bar(i32 noundef signext %1, i32 noundef signext 12345678)
127+
ret void
128+
}

0 commit comments

Comments
 (0)