
Commit 06246b2

[RISCV] Add shrinkwrap test cases showing gaps in current impl
This covers multiple interactions reduced from larger workloads:
1) Rematerializing "addi s0, x0, <imm>" to avoid the need to spill a CSR, with the common user being a branch (i.e. branch-on-immediate idioms).
2) Rematerializing "addi s0, a0, <imm>" to avoid the need to spill a CSR, with the common user being a vector load or store (i.e. because we don't have reg+imm addressing for vector memory operations).
3) Independent of the previous, we could still shrink wrap these cases by locally using a non-CSR and deferring the move into the CSR to the non-shrink-wrapped path.
4) Oddly, MachineCSE produces a different result when an edge is manually split, even though the edge split should be irrelevant to the CSE.
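For orientation, a minimal IR shape for case (1) might look like the following (a hypothetical example written for this summary, not taken from the commit; @branch_on_imm and @bar are illustrative names). The constant 57 has to be materialized into a register for each branch, and keeping it live across the call can lead to it being placed in a callee-saved register (and thus a CSR spill) instead of being rematerialized after the call:

define void @branch_on_imm(i32 zeroext %a) {
  %cmp0 = icmp eq i32 %a, 57
  br i1 %cmp0, label %exit, label %do_call

do_call:
  %b = call i32 @bar()
  %cmp1 = icmp eq i32 %b, 57
  br i1 %cmp1, label %exit, label %do_call

exit:
  ret void
}

declare zeroext i32 @bar()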
1 parent 82b4379 commit 06246b2

File tree

2 files changed: +629, -0 lines changed

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v < %s | FileCheck --check-prefix=RV32 %s
; RUN: llc -mtriple=riscv64 -mattr=+m,+v < %s | FileCheck --check-prefix=RV64 %s

; FIXME: We can rematerialize "addi s0, a1, 32" (ideally along the edge
; %do_call -> %exit), and shrink wrap this routine
define void @vecaddr_straightline(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_straightline:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: addi s0, a1, 32
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vle32.v v8, (s0)
; RV32-NEXT: vadd.vi v8, v8, 1
; RV32-NEXT: li a1, 57
; RV32-NEXT: vse32.v v8, (s0)
; RV32-NEXT: beq a0, a1, .LBB0_2
; RV32-NEXT: # %bb.1: # %do_call
; RV32-NEXT: call foo
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: .LBB0_2: # %exit
; RV32-NEXT: vle32.v v8, (s0)
; RV32-NEXT: vadd.vi v8, v8, 1
; RV32-NEXT: vse32.v v8, (s0)
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vecaddr_straightline:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: addi s0, a1, 32
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vle32.v v8, (s0)
; RV64-NEXT: vadd.vi v8, v8, 1
; RV64-NEXT: li a1, 57
; RV64-NEXT: vse32.v v8, (s0)
; RV64-NEXT: beq a0, a1, .LBB0_2
; RV64-NEXT: # %bb.1: # %do_call
; RV64-NEXT: call foo
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: .LBB0_2: # %exit
; RV64-NEXT: vle32.v v8, (s0)
; RV64-NEXT: vadd.vi v8, v8, 1
; RV64-NEXT: vse32.v v8, (s0)
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%gep = getelementptr i8, ptr %p, i32 32
%v1 = load <4 x i32>, ptr %gep
%v2 = add <4 x i32> %v1, splat (i32 1)
store <4 x i32> %v2, ptr %gep
%cmp0 = icmp eq i32 %a, 57
br i1 %cmp0, label %exit, label %do_call
do_call:
call i32 @foo()
br label %exit
exit:
%v3 = load <4 x i32>, ptr %gep
%v4 = add <4 x i32> %v3, splat (i32 1)
store <4 x i32> %v4, ptr %gep
ret void
}
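
; For reference, one possible shrink-wrapped shape for the function above
; (hypothetical sketch only, RV64 flavor, CFI directives omitted; a2/a3 are
; arbitrary scratch registers, and this is not current or asserted output):
; the address lives in a caller-saved register on the fast path, and the
; frame setup plus the copy into s0 are confined to %do_call, as item (3)
; of the commit message suggests.
;
; vecaddr_straightline:
;         addi    a2, a1, 32              # address in a scratch register
;         vsetivli zero, 4, e32, m1, ta, ma
;         vle32.v v8, (a2)
;         vadd.vi v8, v8, 1
;         li      a3, 57
;         vse32.v v8, (a2)
;         beq     a0, a3, .LBB0_2
; # %bb.1: %do_call - only this path pays for the frame and the CSR
;         addi    sp, sp, -16
;         sd      ra, 8(sp)
;         sd      s0, 0(sp)
;         mv      s0, a2                  # keep the address live across the call
;         call    foo
;         mv      a2, s0                  # back in the register %exit expects
;         ld      ra, 8(sp)
;         ld      s0, 0(sp)
;         addi    sp, sp, 16
;         vsetivli zero, 4, e32, m1, ta, ma
; .LBB0_2: # %exit
;         vle32.v v8, (a2)
;         vadd.vi v8, v8, 1
;         vse32.v v8, (a2)
;         ret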

; In this case, the second use is in a loop, so using a callee-saved
; register to avoid a remat is the profitable choice.
; FIXME: We can shrink wrap the frame setup around the loop
; and avoid it along the %bb.0 -> %exit edge
define void @vecaddr_loop(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_loop:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: addi s0, a1, 32
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vle32.v v8, (s0)
; RV32-NEXT: vadd.vi v8, v8, 1
; RV32-NEXT: li a1, 57
; RV32-NEXT: vse32.v v8, (s0)
; RV32-NEXT: beq a0, a1, .LBB1_2
; RV32-NEXT: .LBB1_1: # %do_call
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-NEXT: call foo
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vle32.v v8, (s0)
; RV32-NEXT: vadd.vi v8, v8, 1
; RV32-NEXT: vse32.v v8, (s0)
; RV32-NEXT: bnez a0, .LBB1_1
; RV32-NEXT: .LBB1_2: # %exit
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vecaddr_loop:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: addi s0, a1, 32
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vle32.v v8, (s0)
; RV64-NEXT: vadd.vi v8, v8, 1
; RV64-NEXT: li a1, 57
; RV64-NEXT: vse32.v v8, (s0)
; RV64-NEXT: beq a0, a1, .LBB1_2
; RV64-NEXT: .LBB1_1: # %do_call
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-NEXT: call foo
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vle32.v v8, (s0)
; RV64-NEXT: vadd.vi v8, v8, 1
; RV64-NEXT: vse32.v v8, (s0)
; RV64-NEXT: bnez a0, .LBB1_1
; RV64-NEXT: .LBB1_2: # %exit
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%gep = getelementptr i8, ptr %p, i32 32
%v1 = load <4 x i32>, ptr %gep
%v2 = add <4 x i32> %v1, splat (i32 1)
store <4 x i32> %v2, ptr %gep
%cmp0 = icmp eq i32 %a, 57
br i1 %cmp0, label %exit, label %do_call
do_call:
%b = call i32 @foo()
%v3 = load <4 x i32>, ptr %gep
%v4 = add <4 x i32> %v3, splat (i32 1)
store <4 x i32> %v4, ptr %gep

%cmp1 = icmp eq i32 %b, 0
br i1 %cmp1, label %exit, label %do_call
exit:
ret void
}
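
; For reference, one possible shrink-wrapped shape for the loop case above
; (hypothetical sketch only, RV64 flavor, CFI directives omitted; labels and
; the a2/a3 scratch registers are illustrative, and this is not current or
; asserted output): the %bb.0 -> %exit edge stays frame-free, while the
; prologue, the copy into s0, and the epilogue wrap only the loop.
;
; vecaddr_loop:
;         addi    a2, a1, 32
;         vsetivli zero, 4, e32, m1, ta, ma
;         vle32.v v8, (a2)
;         vadd.vi v8, v8, 1
;         li      a3, 57
;         vse32.v v8, (a2)
;         beq     a0, a3, .LBB1_3
; # %bb.1: loop preheader - frame setup only when the loop will run
;         addi    sp, sp, -16
;         sd      ra, 8(sp)
;         sd      s0, 0(sp)
;         mv      s0, a2                  # the CSR is still worth using inside the loop
; .LBB1_2: # %do_call
;         call    foo
;         vsetivli zero, 4, e32, m1, ta, ma
;         vle32.v v8, (s0)
;         vadd.vi v8, v8, 1
;         vse32.v v8, (s0)
;         bnez    a0, .LBB1_2
; # %bb.3: loop exit - epilogue before rejoining the frame-free path
;         ld      ra, 8(sp)
;         ld      s0, 0(sp)
;         addi    sp, sp, 16
; .LBB1_3: # %exit
;         ret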

declare zeroext i32 @foo()
