Skip to content

Commit 64db738

Browse files
committed
[RISCV] Exploit sh3add/sh2add for stack offsets by shifted 12-bit constants
If we're falling back to generic constant formation in a register + add/sub, we can check if we have a constant which is 12-bits but left shifted by 2 or 3. If so, we can use a sh2add or sh3add to perform the shift and add in a single instruction. This is profitable when the unshifted constant would require two instructions (LUI/ADDI) to form, and neutral if we'd need at least one uncompressed instruction. We need to avoid this case when c.lui could be used to form the immediate to avoid a size regression. c.lui/c.li are our only compressed single instruction immediate cases, and the c.li can't reach here. Since stacks are aligned to 16 bytes by default, sh3add allows addressing (aligned) data out to 2^14 (i.e. 16kb) in at most two instructions w/zba.
1 parent f5cf98c commit 64db738

File tree

3 files changed

+481
-242
lines changed

3 files changed

+481
-242
lines changed

llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,31 @@ void RISCVRegisterInfo::adjustReg(MachineBasicBlock &MBB,
248248
return;
249249
}
250250

251+
// Use shNadd if doing so lets us materialize a 12 bit immediate with a single
252+
// instruction. This saves 1 instruction over the full lui/addi+add fallback
253+
// path. We avoid anything which can be done with a single lui as it might
254+
// be compressible. Note that the sh1add case is fully covered by the 2x addi
255+
// case just above and is thus omitted.
256+
if (ST.hasStdExtZba() && (Val & 0xFFF) != 0) {
257+
unsigned Opc = 0;
258+
if (isShiftedInt<12, 3>(Val)) {
259+
Opc = RISCV::SH3ADD;
260+
Val = Val >> 3;
261+
} else if (isShiftedInt<12, 2>(Val)) {
262+
Opc = RISCV::SH2ADD;
263+
Val = Val >> 2;
264+
}
265+
if (Opc) {
266+
Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
267+
TII->movImm(MBB, II, DL, ScratchReg, Val, Flag);
268+
BuildMI(MBB, II, DL, TII->get(Opc), DestReg)
269+
.addReg(ScratchReg, RegState::Kill)
270+
.addReg(SrcReg, getKillRegState(KillSrcReg))
271+
.setMIFlag(Flag);
272+
return;
273+
}
274+
}
275+
251276
unsigned Opc = RISCV::ADD;
252277
if (Val < 0) {
253278
Val = -Val;

llvm/test/CodeGen/RISCV/prolog-epilogue.ll

Lines changed: 210 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -182,43 +182,77 @@ define void @frame_4kb() {
182182
}
183183

184184
define void @frame_4kb_offset_128() {
185-
; RV32-LABEL: frame_4kb_offset_128:
186-
; RV32: # %bb.0:
187-
; RV32-NEXT: addi sp, sp, -2032
188-
; RV32-NEXT: .cfi_def_cfa_offset 2032
189-
; RV32-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
190-
; RV32-NEXT: .cfi_offset ra, -4
191-
; RV32-NEXT: lui a0, 1
192-
; RV32-NEXT: addi a0, a0, 128
193-
; RV32-NEXT: sub sp, sp, a0
194-
; RV32-NEXT: .cfi_def_cfa_offset 6256
195-
; RV32-NEXT: addi a0, sp, 12
196-
; RV32-NEXT: call callee
197-
; RV32-NEXT: lui a0, 1
198-
; RV32-NEXT: addi a0, a0, 128
199-
; RV32-NEXT: add sp, sp, a0
200-
; RV32-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
201-
; RV32-NEXT: addi sp, sp, 2032
202-
; RV32-NEXT: ret
185+
; RV32I-LABEL: frame_4kb_offset_128:
186+
; RV32I: # %bb.0:
187+
; RV32I-NEXT: addi sp, sp, -2032
188+
; RV32I-NEXT: .cfi_def_cfa_offset 2032
189+
; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
190+
; RV32I-NEXT: .cfi_offset ra, -4
191+
; RV32I-NEXT: lui a0, 1
192+
; RV32I-NEXT: addi a0, a0, 128
193+
; RV32I-NEXT: sub sp, sp, a0
194+
; RV32I-NEXT: .cfi_def_cfa_offset 6256
195+
; RV32I-NEXT: addi a0, sp, 12
196+
; RV32I-NEXT: call callee
197+
; RV32I-NEXT: lui a0, 1
198+
; RV32I-NEXT: addi a0, a0, 128
199+
; RV32I-NEXT: add sp, sp, a0
200+
; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
201+
; RV32I-NEXT: addi sp, sp, 2032
202+
; RV32I-NEXT: ret
203203
;
204-
; RV64-LABEL: frame_4kb_offset_128:
205-
; RV64: # %bb.0:
206-
; RV64-NEXT: addi sp, sp, -2032
207-
; RV64-NEXT: .cfi_def_cfa_offset 2032
208-
; RV64-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
209-
; RV64-NEXT: .cfi_offset ra, -8
210-
; RV64-NEXT: lui a0, 1
211-
; RV64-NEXT: addiw a0, a0, 128
212-
; RV64-NEXT: sub sp, sp, a0
213-
; RV64-NEXT: .cfi_def_cfa_offset 6256
214-
; RV64-NEXT: addi a0, sp, 8
215-
; RV64-NEXT: call callee
216-
; RV64-NEXT: lui a0, 1
217-
; RV64-NEXT: addiw a0, a0, 128
218-
; RV64-NEXT: add sp, sp, a0
219-
; RV64-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
220-
; RV64-NEXT: addi sp, sp, 2032
221-
; RV64-NEXT: ret
204+
; RV32ZBA-LABEL: frame_4kb_offset_128:
205+
; RV32ZBA: # %bb.0:
206+
; RV32ZBA-NEXT: addi sp, sp, -2032
207+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 2032
208+
; RV32ZBA-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
209+
; RV32ZBA-NEXT: .cfi_offset ra, -4
210+
; RV32ZBA-NEXT: li a0, -528
211+
; RV32ZBA-NEXT: sh3add sp, a0, sp
212+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 6256
213+
; RV32ZBA-NEXT: addi a0, sp, 12
214+
; RV32ZBA-NEXT: call callee
215+
; RV32ZBA-NEXT: li a0, 528
216+
; RV32ZBA-NEXT: sh3add sp, a0, sp
217+
; RV32ZBA-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
218+
; RV32ZBA-NEXT: addi sp, sp, 2032
219+
; RV32ZBA-NEXT: ret
220+
;
221+
; RV64I-LABEL: frame_4kb_offset_128:
222+
; RV64I: # %bb.0:
223+
; RV64I-NEXT: addi sp, sp, -2032
224+
; RV64I-NEXT: .cfi_def_cfa_offset 2032
225+
; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
226+
; RV64I-NEXT: .cfi_offset ra, -8
227+
; RV64I-NEXT: lui a0, 1
228+
; RV64I-NEXT: addiw a0, a0, 128
229+
; RV64I-NEXT: sub sp, sp, a0
230+
; RV64I-NEXT: .cfi_def_cfa_offset 6256
231+
; RV64I-NEXT: addi a0, sp, 8
232+
; RV64I-NEXT: call callee
233+
; RV64I-NEXT: lui a0, 1
234+
; RV64I-NEXT: addiw a0, a0, 128
235+
; RV64I-NEXT: add sp, sp, a0
236+
; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
237+
; RV64I-NEXT: addi sp, sp, 2032
238+
; RV64I-NEXT: ret
239+
;
240+
; RV64ZBA-LABEL: frame_4kb_offset_128:
241+
; RV64ZBA: # %bb.0:
242+
; RV64ZBA-NEXT: addi sp, sp, -2032
243+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 2032
244+
; RV64ZBA-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
245+
; RV64ZBA-NEXT: .cfi_offset ra, -8
246+
; RV64ZBA-NEXT: li a0, -528
247+
; RV64ZBA-NEXT: sh3add sp, a0, sp
248+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 6256
249+
; RV64ZBA-NEXT: addi a0, sp, 8
250+
; RV64ZBA-NEXT: call callee
251+
; RV64ZBA-NEXT: li a0, 528
252+
; RV64ZBA-NEXT: sh3add sp, a0, sp
253+
; RV64ZBA-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
254+
; RV64ZBA-NEXT: addi sp, sp, 2032
255+
; RV64ZBA-NEXT: ret
222256
%a = alloca [6240 x i8]
223257
call void @callee(ptr %a)
224258
ret void
@@ -266,86 +300,154 @@ define void @frame_8kb() {
266300
}
267301

268302
define void @frame_8kb_offset_128() {
269-
; RV32-LABEL: frame_8kb_offset_128:
270-
; RV32: # %bb.0:
271-
; RV32-NEXT: addi sp, sp, -2032
272-
; RV32-NEXT: .cfi_def_cfa_offset 2032
273-
; RV32-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
274-
; RV32-NEXT: .cfi_offset ra, -4
275-
; RV32-NEXT: lui a0, 2
276-
; RV32-NEXT: addi a0, a0, 128
277-
; RV32-NEXT: sub sp, sp, a0
278-
; RV32-NEXT: .cfi_def_cfa_offset 10352
279-
; RV32-NEXT: addi a0, sp, 12
280-
; RV32-NEXT: call callee
281-
; RV32-NEXT: lui a0, 2
282-
; RV32-NEXT: addi a0, a0, 128
283-
; RV32-NEXT: add sp, sp, a0
284-
; RV32-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
285-
; RV32-NEXT: addi sp, sp, 2032
286-
; RV32-NEXT: ret
303+
; RV32I-LABEL: frame_8kb_offset_128:
304+
; RV32I: # %bb.0:
305+
; RV32I-NEXT: addi sp, sp, -2032
306+
; RV32I-NEXT: .cfi_def_cfa_offset 2032
307+
; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
308+
; RV32I-NEXT: .cfi_offset ra, -4
309+
; RV32I-NEXT: lui a0, 2
310+
; RV32I-NEXT: addi a0, a0, 128
311+
; RV32I-NEXT: sub sp, sp, a0
312+
; RV32I-NEXT: .cfi_def_cfa_offset 10352
313+
; RV32I-NEXT: addi a0, sp, 12
314+
; RV32I-NEXT: call callee
315+
; RV32I-NEXT: lui a0, 2
316+
; RV32I-NEXT: addi a0, a0, 128
317+
; RV32I-NEXT: add sp, sp, a0
318+
; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
319+
; RV32I-NEXT: addi sp, sp, 2032
320+
; RV32I-NEXT: ret
287321
;
288-
; RV64-LABEL: frame_8kb_offset_128:
289-
; RV64: # %bb.0:
290-
; RV64-NEXT: addi sp, sp, -2032
291-
; RV64-NEXT: .cfi_def_cfa_offset 2032
292-
; RV64-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
293-
; RV64-NEXT: .cfi_offset ra, -8
294-
; RV64-NEXT: lui a0, 2
295-
; RV64-NEXT: addiw a0, a0, 128
296-
; RV64-NEXT: sub sp, sp, a0
297-
; RV64-NEXT: .cfi_def_cfa_offset 10352
298-
; RV64-NEXT: addi a0, sp, 8
299-
; RV64-NEXT: call callee
300-
; RV64-NEXT: lui a0, 2
301-
; RV64-NEXT: addiw a0, a0, 128
302-
; RV64-NEXT: add sp, sp, a0
303-
; RV64-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
304-
; RV64-NEXT: addi sp, sp, 2032
305-
; RV64-NEXT: ret
322+
; RV32ZBA-LABEL: frame_8kb_offset_128:
323+
; RV32ZBA: # %bb.0:
324+
; RV32ZBA-NEXT: addi sp, sp, -2032
325+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 2032
326+
; RV32ZBA-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
327+
; RV32ZBA-NEXT: .cfi_offset ra, -4
328+
; RV32ZBA-NEXT: li a0, -1040
329+
; RV32ZBA-NEXT: sh3add sp, a0, sp
330+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 10352
331+
; RV32ZBA-NEXT: addi a0, sp, 12
332+
; RV32ZBA-NEXT: call callee
333+
; RV32ZBA-NEXT: li a0, 1040
334+
; RV32ZBA-NEXT: sh3add sp, a0, sp
335+
; RV32ZBA-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
336+
; RV32ZBA-NEXT: addi sp, sp, 2032
337+
; RV32ZBA-NEXT: ret
338+
;
339+
; RV64I-LABEL: frame_8kb_offset_128:
340+
; RV64I: # %bb.0:
341+
; RV64I-NEXT: addi sp, sp, -2032
342+
; RV64I-NEXT: .cfi_def_cfa_offset 2032
343+
; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
344+
; RV64I-NEXT: .cfi_offset ra, -8
345+
; RV64I-NEXT: lui a0, 2
346+
; RV64I-NEXT: addiw a0, a0, 128
347+
; RV64I-NEXT: sub sp, sp, a0
348+
; RV64I-NEXT: .cfi_def_cfa_offset 10352
349+
; RV64I-NEXT: addi a0, sp, 8
350+
; RV64I-NEXT: call callee
351+
; RV64I-NEXT: lui a0, 2
352+
; RV64I-NEXT: addiw a0, a0, 128
353+
; RV64I-NEXT: add sp, sp, a0
354+
; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
355+
; RV64I-NEXT: addi sp, sp, 2032
356+
; RV64I-NEXT: ret
357+
;
358+
; RV64ZBA-LABEL: frame_8kb_offset_128:
359+
; RV64ZBA: # %bb.0:
360+
; RV64ZBA-NEXT: addi sp, sp, -2032
361+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 2032
362+
; RV64ZBA-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
363+
; RV64ZBA-NEXT: .cfi_offset ra, -8
364+
; RV64ZBA-NEXT: li a0, -1040
365+
; RV64ZBA-NEXT: sh3add sp, a0, sp
366+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 10352
367+
; RV64ZBA-NEXT: addi a0, sp, 8
368+
; RV64ZBA-NEXT: call callee
369+
; RV64ZBA-NEXT: li a0, 1040
370+
; RV64ZBA-NEXT: sh3add sp, a0, sp
371+
; RV64ZBA-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
372+
; RV64ZBA-NEXT: addi sp, sp, 2032
373+
; RV64ZBA-NEXT: ret
306374
%a = alloca [10336 x i8]
307375
call void @callee(ptr %a)
308376
ret void
309377
}
310378

311379
define void @frame_16kb_minus_80() {
312-
; RV32-LABEL: frame_16kb_minus_80:
313-
; RV32: # %bb.0:
314-
; RV32-NEXT: addi sp, sp, -2032
315-
; RV32-NEXT: .cfi_def_cfa_offset 2032
316-
; RV32-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
317-
; RV32-NEXT: .cfi_offset ra, -4
318-
; RV32-NEXT: lui a0, 4
319-
; RV32-NEXT: addi a0, a0, -80
320-
; RV32-NEXT: sub sp, sp, a0
321-
; RV32-NEXT: .cfi_def_cfa_offset 18336
322-
; RV32-NEXT: addi a0, sp, 12
323-
; RV32-NEXT: call callee
324-
; RV32-NEXT: lui a0, 4
325-
; RV32-NEXT: addi a0, a0, -80
326-
; RV32-NEXT: add sp, sp, a0
327-
; RV32-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
328-
; RV32-NEXT: addi sp, sp, 2032
329-
; RV32-NEXT: ret
380+
; RV32I-LABEL: frame_16kb_minus_80:
381+
; RV32I: # %bb.0:
382+
; RV32I-NEXT: addi sp, sp, -2032
383+
; RV32I-NEXT: .cfi_def_cfa_offset 2032
384+
; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
385+
; RV32I-NEXT: .cfi_offset ra, -4
386+
; RV32I-NEXT: lui a0, 4
387+
; RV32I-NEXT: addi a0, a0, -80
388+
; RV32I-NEXT: sub sp, sp, a0
389+
; RV32I-NEXT: .cfi_def_cfa_offset 18336
390+
; RV32I-NEXT: addi a0, sp, 12
391+
; RV32I-NEXT: call callee
392+
; RV32I-NEXT: lui a0, 4
393+
; RV32I-NEXT: addi a0, a0, -80
394+
; RV32I-NEXT: add sp, sp, a0
395+
; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
396+
; RV32I-NEXT: addi sp, sp, 2032
397+
; RV32I-NEXT: ret
330398
;
331-
; RV64-LABEL: frame_16kb_minus_80:
332-
; RV64: # %bb.0:
333-
; RV64-NEXT: addi sp, sp, -2032
334-
; RV64-NEXT: .cfi_def_cfa_offset 2032
335-
; RV64-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
336-
; RV64-NEXT: .cfi_offset ra, -8
337-
; RV64-NEXT: lui a0, 4
338-
; RV64-NEXT: addiw a0, a0, -80
339-
; RV64-NEXT: sub sp, sp, a0
340-
; RV64-NEXT: .cfi_def_cfa_offset 18336
341-
; RV64-NEXT: addi a0, sp, 8
342-
; RV64-NEXT: call callee
343-
; RV64-NEXT: lui a0, 4
344-
; RV64-NEXT: addiw a0, a0, -80
345-
; RV64-NEXT: add sp, sp, a0
346-
; RV64-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
347-
; RV64-NEXT: addi sp, sp, 2032
348-
; RV64-NEXT: ret
399+
; RV32ZBA-LABEL: frame_16kb_minus_80:
400+
; RV32ZBA: # %bb.0:
401+
; RV32ZBA-NEXT: addi sp, sp, -2032
402+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 2032
403+
; RV32ZBA-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
404+
; RV32ZBA-NEXT: .cfi_offset ra, -4
405+
; RV32ZBA-NEXT: li a0, -2038
406+
; RV32ZBA-NEXT: sh3add sp, a0, sp
407+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 18336
408+
; RV32ZBA-NEXT: addi a0, sp, 12
409+
; RV32ZBA-NEXT: call callee
410+
; RV32ZBA-NEXT: li a0, 2038
411+
; RV32ZBA-NEXT: sh3add sp, a0, sp
412+
; RV32ZBA-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
413+
; RV32ZBA-NEXT: addi sp, sp, 2032
414+
; RV32ZBA-NEXT: ret
415+
;
416+
; RV64I-LABEL: frame_16kb_minus_80:
417+
; RV64I: # %bb.0:
418+
; RV64I-NEXT: addi sp, sp, -2032
419+
; RV64I-NEXT: .cfi_def_cfa_offset 2032
420+
; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
421+
; RV64I-NEXT: .cfi_offset ra, -8
422+
; RV64I-NEXT: lui a0, 4
423+
; RV64I-NEXT: addiw a0, a0, -80
424+
; RV64I-NEXT: sub sp, sp, a0
425+
; RV64I-NEXT: .cfi_def_cfa_offset 18336
426+
; RV64I-NEXT: addi a0, sp, 8
427+
; RV64I-NEXT: call callee
428+
; RV64I-NEXT: lui a0, 4
429+
; RV64I-NEXT: addiw a0, a0, -80
430+
; RV64I-NEXT: add sp, sp, a0
431+
; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
432+
; RV64I-NEXT: addi sp, sp, 2032
433+
; RV64I-NEXT: ret
434+
;
435+
; RV64ZBA-LABEL: frame_16kb_minus_80:
436+
; RV64ZBA: # %bb.0:
437+
; RV64ZBA-NEXT: addi sp, sp, -2032
438+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 2032
439+
; RV64ZBA-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
440+
; RV64ZBA-NEXT: .cfi_offset ra, -8
441+
; RV64ZBA-NEXT: li a0, -2038
442+
; RV64ZBA-NEXT: sh3add sp, a0, sp
443+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 18336
444+
; RV64ZBA-NEXT: addi a0, sp, 8
445+
; RV64ZBA-NEXT: call callee
446+
; RV64ZBA-NEXT: li a0, 2038
447+
; RV64ZBA-NEXT: sh3add sp, a0, sp
448+
; RV64ZBA-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
449+
; RV64ZBA-NEXT: addi sp, sp, 2032
450+
; RV64ZBA-NEXT: ret
349451
%a = alloca [18320 x i8]
350452
call void @callee(ptr %a)
351453
ret void
@@ -430,8 +532,3 @@ define void @frame_32kb() {
430532
call void @callee(ptr %a)
431533
ret void
432534
}
433-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
434-
; RV32I: {{.*}}
435-
; RV32ZBA: {{.*}}
436-
; RV64I: {{.*}}
437-
; RV64ZBA: {{.*}}

0 commit comments

Comments
 (0)