Skip to content

Commit eb26edb

Browse files
authored
[RISCV] Exploit sh3add/sh2add for stack offsets by shifted 12-bit constants (#87950)
If we're falling back to generic constant formation in a register + add/sub, we can check if we have a constant which is 12-bits but left shifted by 2 or 3. If so, we can use a sh2add or sh3add to perform the shift and add in a single instruction. This is profitable when the unshifted constant would require two instructions (LUI/ADDI) to form, but is never harmful since we're going to need at least two instructions regardless of the constant value. Since stacks are aligned to 16 bytes by default, sh3add allows addressing (aligned) data out to 2^14 (i.e. 16kb) in at most two instructions w/zba.
1 parent f5cf98c commit eb26edb

File tree

3 files changed

+481
-242
lines changed

3 files changed

+481
-242
lines changed

llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,31 @@ void RISCVRegisterInfo::adjustReg(MachineBasicBlock &MBB,
248248
return;
249249
}
250250

251+
// Use shNadd if doing so lets us materialize a 12 bit immediate with a single
252+
// instruction. This saves 1 instruction over the full lui/addi+add fallback
253+
// path. We avoid anything which can be done with a single lui as it might
254+
// be compressible. Note that the sh1add case is fully covered by the 2x addi
255+
// case just above and is thus omitted.
256+
if (ST.hasStdExtZba() && (Val & 0xFFF) != 0) {
257+
unsigned Opc = 0;
258+
if (isShiftedInt<12, 3>(Val)) {
259+
Opc = RISCV::SH3ADD;
260+
Val = Val >> 3;
261+
} else if (isShiftedInt<12, 2>(Val)) {
262+
Opc = RISCV::SH2ADD;
263+
Val = Val >> 2;
264+
}
265+
if (Opc) {
266+
Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
267+
TII->movImm(MBB, II, DL, ScratchReg, Val, Flag);
268+
BuildMI(MBB, II, DL, TII->get(Opc), DestReg)
269+
.addReg(ScratchReg, RegState::Kill)
270+
.addReg(SrcReg, getKillRegState(KillSrcReg))
271+
.setMIFlag(Flag);
272+
return;
273+
}
274+
}
275+
251276
unsigned Opc = RISCV::ADD;
252277
if (Val < 0) {
253278
Val = -Val;

llvm/test/CodeGen/RISCV/prolog-epilogue.ll

Lines changed: 210 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -182,43 +182,77 @@ define void @frame_4kb() {
182182
}
183183

184184
define void @frame_4kb_offset_128() {
185-
; RV32-LABEL: frame_4kb_offset_128:
186-
; RV32: # %bb.0:
187-
; RV32-NEXT: addi sp, sp, -2032
188-
; RV32-NEXT: .cfi_def_cfa_offset 2032
189-
; RV32-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
190-
; RV32-NEXT: .cfi_offset ra, -4
191-
; RV32-NEXT: lui a0, 1
192-
; RV32-NEXT: addi a0, a0, 128
193-
; RV32-NEXT: sub sp, sp, a0
194-
; RV32-NEXT: .cfi_def_cfa_offset 6256
195-
; RV32-NEXT: addi a0, sp, 12
196-
; RV32-NEXT: call callee
197-
; RV32-NEXT: lui a0, 1
198-
; RV32-NEXT: addi a0, a0, 128
199-
; RV32-NEXT: add sp, sp, a0
200-
; RV32-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
201-
; RV32-NEXT: addi sp, sp, 2032
202-
; RV32-NEXT: ret
185+
; RV32I-LABEL: frame_4kb_offset_128:
186+
; RV32I: # %bb.0:
187+
; RV32I-NEXT: addi sp, sp, -2032
188+
; RV32I-NEXT: .cfi_def_cfa_offset 2032
189+
; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
190+
; RV32I-NEXT: .cfi_offset ra, -4
191+
; RV32I-NEXT: lui a0, 1
192+
; RV32I-NEXT: addi a0, a0, 128
193+
; RV32I-NEXT: sub sp, sp, a0
194+
; RV32I-NEXT: .cfi_def_cfa_offset 6256
195+
; RV32I-NEXT: addi a0, sp, 12
196+
; RV32I-NEXT: call callee
197+
; RV32I-NEXT: lui a0, 1
198+
; RV32I-NEXT: addi a0, a0, 128
199+
; RV32I-NEXT: add sp, sp, a0
200+
; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
201+
; RV32I-NEXT: addi sp, sp, 2032
202+
; RV32I-NEXT: ret
203203
;
204-
; RV64-LABEL: frame_4kb_offset_128:
205-
; RV64: # %bb.0:
206-
; RV64-NEXT: addi sp, sp, -2032
207-
; RV64-NEXT: .cfi_def_cfa_offset 2032
208-
; RV64-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
209-
; RV64-NEXT: .cfi_offset ra, -8
210-
; RV64-NEXT: lui a0, 1
211-
; RV64-NEXT: addiw a0, a0, 128
212-
; RV64-NEXT: sub sp, sp, a0
213-
; RV64-NEXT: .cfi_def_cfa_offset 6256
214-
; RV64-NEXT: addi a0, sp, 8
215-
; RV64-NEXT: call callee
216-
; RV64-NEXT: lui a0, 1
217-
; RV64-NEXT: addiw a0, a0, 128
218-
; RV64-NEXT: add sp, sp, a0
219-
; RV64-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
220-
; RV64-NEXT: addi sp, sp, 2032
221-
; RV64-NEXT: ret
204+
; RV32ZBA-LABEL: frame_4kb_offset_128:
205+
; RV32ZBA: # %bb.0:
206+
; RV32ZBA-NEXT: addi sp, sp, -2032
207+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 2032
208+
; RV32ZBA-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
209+
; RV32ZBA-NEXT: .cfi_offset ra, -4
210+
; RV32ZBA-NEXT: li a0, -528
211+
; RV32ZBA-NEXT: sh3add sp, a0, sp
212+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 6256
213+
; RV32ZBA-NEXT: addi a0, sp, 12
214+
; RV32ZBA-NEXT: call callee
215+
; RV32ZBA-NEXT: li a0, 528
216+
; RV32ZBA-NEXT: sh3add sp, a0, sp
217+
; RV32ZBA-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
218+
; RV32ZBA-NEXT: addi sp, sp, 2032
219+
; RV32ZBA-NEXT: ret
220+
;
221+
; RV64I-LABEL: frame_4kb_offset_128:
222+
; RV64I: # %bb.0:
223+
; RV64I-NEXT: addi sp, sp, -2032
224+
; RV64I-NEXT: .cfi_def_cfa_offset 2032
225+
; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
226+
; RV64I-NEXT: .cfi_offset ra, -8
227+
; RV64I-NEXT: lui a0, 1
228+
; RV64I-NEXT: addiw a0, a0, 128
229+
; RV64I-NEXT: sub sp, sp, a0
230+
; RV64I-NEXT: .cfi_def_cfa_offset 6256
231+
; RV64I-NEXT: addi a0, sp, 8
232+
; RV64I-NEXT: call callee
233+
; RV64I-NEXT: lui a0, 1
234+
; RV64I-NEXT: addiw a0, a0, 128
235+
; RV64I-NEXT: add sp, sp, a0
236+
; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
237+
; RV64I-NEXT: addi sp, sp, 2032
238+
; RV64I-NEXT: ret
239+
;
240+
; RV64ZBA-LABEL: frame_4kb_offset_128:
241+
; RV64ZBA: # %bb.0:
242+
; RV64ZBA-NEXT: addi sp, sp, -2032
243+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 2032
244+
; RV64ZBA-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
245+
; RV64ZBA-NEXT: .cfi_offset ra, -8
246+
; RV64ZBA-NEXT: li a0, -528
247+
; RV64ZBA-NEXT: sh3add sp, a0, sp
248+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 6256
249+
; RV64ZBA-NEXT: addi a0, sp, 8
250+
; RV64ZBA-NEXT: call callee
251+
; RV64ZBA-NEXT: li a0, 528
252+
; RV64ZBA-NEXT: sh3add sp, a0, sp
253+
; RV64ZBA-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
254+
; RV64ZBA-NEXT: addi sp, sp, 2032
255+
; RV64ZBA-NEXT: ret
222256
%a = alloca [6240 x i8]
223257
call void @callee(ptr %a)
224258
ret void
@@ -266,86 +300,154 @@ define void @frame_8kb() {
266300
}
267301

268302
define void @frame_8kb_offset_128() {
269-
; RV32-LABEL: frame_8kb_offset_128:
270-
; RV32: # %bb.0:
271-
; RV32-NEXT: addi sp, sp, -2032
272-
; RV32-NEXT: .cfi_def_cfa_offset 2032
273-
; RV32-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
274-
; RV32-NEXT: .cfi_offset ra, -4
275-
; RV32-NEXT: lui a0, 2
276-
; RV32-NEXT: addi a0, a0, 128
277-
; RV32-NEXT: sub sp, sp, a0
278-
; RV32-NEXT: .cfi_def_cfa_offset 10352
279-
; RV32-NEXT: addi a0, sp, 12
280-
; RV32-NEXT: call callee
281-
; RV32-NEXT: lui a0, 2
282-
; RV32-NEXT: addi a0, a0, 128
283-
; RV32-NEXT: add sp, sp, a0
284-
; RV32-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
285-
; RV32-NEXT: addi sp, sp, 2032
286-
; RV32-NEXT: ret
303+
; RV32I-LABEL: frame_8kb_offset_128:
304+
; RV32I: # %bb.0:
305+
; RV32I-NEXT: addi sp, sp, -2032
306+
; RV32I-NEXT: .cfi_def_cfa_offset 2032
307+
; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
308+
; RV32I-NEXT: .cfi_offset ra, -4
309+
; RV32I-NEXT: lui a0, 2
310+
; RV32I-NEXT: addi a0, a0, 128
311+
; RV32I-NEXT: sub sp, sp, a0
312+
; RV32I-NEXT: .cfi_def_cfa_offset 10352
313+
; RV32I-NEXT: addi a0, sp, 12
314+
; RV32I-NEXT: call callee
315+
; RV32I-NEXT: lui a0, 2
316+
; RV32I-NEXT: addi a0, a0, 128
317+
; RV32I-NEXT: add sp, sp, a0
318+
; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
319+
; RV32I-NEXT: addi sp, sp, 2032
320+
; RV32I-NEXT: ret
287321
;
288-
; RV64-LABEL: frame_8kb_offset_128:
289-
; RV64: # %bb.0:
290-
; RV64-NEXT: addi sp, sp, -2032
291-
; RV64-NEXT: .cfi_def_cfa_offset 2032
292-
; RV64-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
293-
; RV64-NEXT: .cfi_offset ra, -8
294-
; RV64-NEXT: lui a0, 2
295-
; RV64-NEXT: addiw a0, a0, 128
296-
; RV64-NEXT: sub sp, sp, a0
297-
; RV64-NEXT: .cfi_def_cfa_offset 10352
298-
; RV64-NEXT: addi a0, sp, 8
299-
; RV64-NEXT: call callee
300-
; RV64-NEXT: lui a0, 2
301-
; RV64-NEXT: addiw a0, a0, 128
302-
; RV64-NEXT: add sp, sp, a0
303-
; RV64-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
304-
; RV64-NEXT: addi sp, sp, 2032
305-
; RV64-NEXT: ret
322+
; RV32ZBA-LABEL: frame_8kb_offset_128:
323+
; RV32ZBA: # %bb.0:
324+
; RV32ZBA-NEXT: addi sp, sp, -2032
325+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 2032
326+
; RV32ZBA-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
327+
; RV32ZBA-NEXT: .cfi_offset ra, -4
328+
; RV32ZBA-NEXT: li a0, -1040
329+
; RV32ZBA-NEXT: sh3add sp, a0, sp
330+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 10352
331+
; RV32ZBA-NEXT: addi a0, sp, 12
332+
; RV32ZBA-NEXT: call callee
333+
; RV32ZBA-NEXT: li a0, 1040
334+
; RV32ZBA-NEXT: sh3add sp, a0, sp
335+
; RV32ZBA-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
336+
; RV32ZBA-NEXT: addi sp, sp, 2032
337+
; RV32ZBA-NEXT: ret
338+
;
339+
; RV64I-LABEL: frame_8kb_offset_128:
340+
; RV64I: # %bb.0:
341+
; RV64I-NEXT: addi sp, sp, -2032
342+
; RV64I-NEXT: .cfi_def_cfa_offset 2032
343+
; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
344+
; RV64I-NEXT: .cfi_offset ra, -8
345+
; RV64I-NEXT: lui a0, 2
346+
; RV64I-NEXT: addiw a0, a0, 128
347+
; RV64I-NEXT: sub sp, sp, a0
348+
; RV64I-NEXT: .cfi_def_cfa_offset 10352
349+
; RV64I-NEXT: addi a0, sp, 8
350+
; RV64I-NEXT: call callee
351+
; RV64I-NEXT: lui a0, 2
352+
; RV64I-NEXT: addiw a0, a0, 128
353+
; RV64I-NEXT: add sp, sp, a0
354+
; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
355+
; RV64I-NEXT: addi sp, sp, 2032
356+
; RV64I-NEXT: ret
357+
;
358+
; RV64ZBA-LABEL: frame_8kb_offset_128:
359+
; RV64ZBA: # %bb.0:
360+
; RV64ZBA-NEXT: addi sp, sp, -2032
361+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 2032
362+
; RV64ZBA-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
363+
; RV64ZBA-NEXT: .cfi_offset ra, -8
364+
; RV64ZBA-NEXT: li a0, -1040
365+
; RV64ZBA-NEXT: sh3add sp, a0, sp
366+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 10352
367+
; RV64ZBA-NEXT: addi a0, sp, 8
368+
; RV64ZBA-NEXT: call callee
369+
; RV64ZBA-NEXT: li a0, 1040
370+
; RV64ZBA-NEXT: sh3add sp, a0, sp
371+
; RV64ZBA-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
372+
; RV64ZBA-NEXT: addi sp, sp, 2032
373+
; RV64ZBA-NEXT: ret
306374
%a = alloca [10336 x i8]
307375
call void @callee(ptr %a)
308376
ret void
309377
}
310378

311379
define void @frame_16kb_minus_80() {
312-
; RV32-LABEL: frame_16kb_minus_80:
313-
; RV32: # %bb.0:
314-
; RV32-NEXT: addi sp, sp, -2032
315-
; RV32-NEXT: .cfi_def_cfa_offset 2032
316-
; RV32-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
317-
; RV32-NEXT: .cfi_offset ra, -4
318-
; RV32-NEXT: lui a0, 4
319-
; RV32-NEXT: addi a0, a0, -80
320-
; RV32-NEXT: sub sp, sp, a0
321-
; RV32-NEXT: .cfi_def_cfa_offset 18336
322-
; RV32-NEXT: addi a0, sp, 12
323-
; RV32-NEXT: call callee
324-
; RV32-NEXT: lui a0, 4
325-
; RV32-NEXT: addi a0, a0, -80
326-
; RV32-NEXT: add sp, sp, a0
327-
; RV32-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
328-
; RV32-NEXT: addi sp, sp, 2032
329-
; RV32-NEXT: ret
380+
; RV32I-LABEL: frame_16kb_minus_80:
381+
; RV32I: # %bb.0:
382+
; RV32I-NEXT: addi sp, sp, -2032
383+
; RV32I-NEXT: .cfi_def_cfa_offset 2032
384+
; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
385+
; RV32I-NEXT: .cfi_offset ra, -4
386+
; RV32I-NEXT: lui a0, 4
387+
; RV32I-NEXT: addi a0, a0, -80
388+
; RV32I-NEXT: sub sp, sp, a0
389+
; RV32I-NEXT: .cfi_def_cfa_offset 18336
390+
; RV32I-NEXT: addi a0, sp, 12
391+
; RV32I-NEXT: call callee
392+
; RV32I-NEXT: lui a0, 4
393+
; RV32I-NEXT: addi a0, a0, -80
394+
; RV32I-NEXT: add sp, sp, a0
395+
; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
396+
; RV32I-NEXT: addi sp, sp, 2032
397+
; RV32I-NEXT: ret
330398
;
331-
; RV64-LABEL: frame_16kb_minus_80:
332-
; RV64: # %bb.0:
333-
; RV64-NEXT: addi sp, sp, -2032
334-
; RV64-NEXT: .cfi_def_cfa_offset 2032
335-
; RV64-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
336-
; RV64-NEXT: .cfi_offset ra, -8
337-
; RV64-NEXT: lui a0, 4
338-
; RV64-NEXT: addiw a0, a0, -80
339-
; RV64-NEXT: sub sp, sp, a0
340-
; RV64-NEXT: .cfi_def_cfa_offset 18336
341-
; RV64-NEXT: addi a0, sp, 8
342-
; RV64-NEXT: call callee
343-
; RV64-NEXT: lui a0, 4
344-
; RV64-NEXT: addiw a0, a0, -80
345-
; RV64-NEXT: add sp, sp, a0
346-
; RV64-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
347-
; RV64-NEXT: addi sp, sp, 2032
348-
; RV64-NEXT: ret
399+
; RV32ZBA-LABEL: frame_16kb_minus_80:
400+
; RV32ZBA: # %bb.0:
401+
; RV32ZBA-NEXT: addi sp, sp, -2032
402+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 2032
403+
; RV32ZBA-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
404+
; RV32ZBA-NEXT: .cfi_offset ra, -4
405+
; RV32ZBA-NEXT: li a0, -2038
406+
; RV32ZBA-NEXT: sh3add sp, a0, sp
407+
; RV32ZBA-NEXT: .cfi_def_cfa_offset 18336
408+
; RV32ZBA-NEXT: addi a0, sp, 12
409+
; RV32ZBA-NEXT: call callee
410+
; RV32ZBA-NEXT: li a0, 2038
411+
; RV32ZBA-NEXT: sh3add sp, a0, sp
412+
; RV32ZBA-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
413+
; RV32ZBA-NEXT: addi sp, sp, 2032
414+
; RV32ZBA-NEXT: ret
415+
;
416+
; RV64I-LABEL: frame_16kb_minus_80:
417+
; RV64I: # %bb.0:
418+
; RV64I-NEXT: addi sp, sp, -2032
419+
; RV64I-NEXT: .cfi_def_cfa_offset 2032
420+
; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
421+
; RV64I-NEXT: .cfi_offset ra, -8
422+
; RV64I-NEXT: lui a0, 4
423+
; RV64I-NEXT: addiw a0, a0, -80
424+
; RV64I-NEXT: sub sp, sp, a0
425+
; RV64I-NEXT: .cfi_def_cfa_offset 18336
426+
; RV64I-NEXT: addi a0, sp, 8
427+
; RV64I-NEXT: call callee
428+
; RV64I-NEXT: lui a0, 4
429+
; RV64I-NEXT: addiw a0, a0, -80
430+
; RV64I-NEXT: add sp, sp, a0
431+
; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
432+
; RV64I-NEXT: addi sp, sp, 2032
433+
; RV64I-NEXT: ret
434+
;
435+
; RV64ZBA-LABEL: frame_16kb_minus_80:
436+
; RV64ZBA: # %bb.0:
437+
; RV64ZBA-NEXT: addi sp, sp, -2032
438+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 2032
439+
; RV64ZBA-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
440+
; RV64ZBA-NEXT: .cfi_offset ra, -8
441+
; RV64ZBA-NEXT: li a0, -2038
442+
; RV64ZBA-NEXT: sh3add sp, a0, sp
443+
; RV64ZBA-NEXT: .cfi_def_cfa_offset 18336
444+
; RV64ZBA-NEXT: addi a0, sp, 8
445+
; RV64ZBA-NEXT: call callee
446+
; RV64ZBA-NEXT: li a0, 2038
447+
; RV64ZBA-NEXT: sh3add sp, a0, sp
448+
; RV64ZBA-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
449+
; RV64ZBA-NEXT: addi sp, sp, 2032
450+
; RV64ZBA-NEXT: ret
349451
%a = alloca [18320 x i8]
350452
call void @callee(ptr %a)
351453
ret void
@@ -430,8 +532,3 @@ define void @frame_32kb() {
430532
call void @callee(ptr %a)
431533
ret void
432534
}
433-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
434-
; RV32I: {{.*}}
435-
; RV32ZBA: {{.*}}
436-
; RV64I: {{.*}}
437-
; RV64ZBA: {{.*}}

0 commit comments

Comments
 (0)