Skip to content

Commit d363847

Browse files
committed
[AMDGPU][SDAG] Add tests for ISD::PTRADD DAG combines
Pre-committing tests to show improvements in a follow-up PR with the combines.
1 parent 63f76a4 commit d363847

File tree

1 file changed

+207
-0
lines changed

1 file changed

+207
-0
lines changed
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
4+
5+
; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
6+
; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable
7+
; similar transformations in that pass.
8+
9+
; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
10+
; %voffset (the "z" operand of the outer ptradd) has two uses in this function:
; as the index of %gep1 and as an operand of the final add. In the checks below,
; the run with -amdgpu-use-sdag-ptradd=1 currently adds the constant 24 with a
; separate v_lshl_add_u64, while the legacy run folds it into the load as an
; immediate offset of 24.
define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
11+
; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
12+
; GFX942_PTRADD: ; %bb.0:
13+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 24
15+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
16+
; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
17+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
18+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
19+
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
20+
;
21+
; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
22+
; GFX942_LEGACY: ; %bb.0:
23+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24+
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
25+
; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
26+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
27+
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
28+
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
29+
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
30+
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
31+
%l = load i64, ptr addrspace(1) %gep1, align 8
32+
; Second use of %voffset, in addition to the GEP index above.
%r = add i64 %l, %voffset
33+
ret i64 %r
34+
}
35+
36+
; Tests a GEP whose index is itself an add of a variable and a constant, i.e.
; base + (%voffset + 24). In the checks below, the legacy run folds the constant
; into the load as an immediate offset of 24, while the run with
; -amdgpu-use-sdag-ptradd=1 currently adds 24 to the offset register pair with a
; separate v_lshl_add_u64 before adding the base.
define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
37+
; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
38+
; GFX942_PTRADD: ; %bb.0:
39+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 24
41+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
42+
; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
43+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
44+
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
45+
;
46+
; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
47+
; GFX942_LEGACY: ; %bb.0:
48+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49+
; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
50+
; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
51+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
52+
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
53+
%add0 = add nuw nsw i64 %voffset, 24
54+
%gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
55+
%l = load i64, ptr addrspace(1) %gep0, align 8
56+
ret i64 %l
57+
}
58+
59+
; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These
60+
; would be folded away in most cases, but the index computation introduced by
61+
; the legalization of wide vector stores can for example introduce them.
62+
; Stores a <16 x i32> kernel argument to %out; both check groups below emit four
; global_store_dwordx4 instructions. The difference to look at: the legacy run
; addresses the first emitted store as s[0:1] with an immediate offset of 48,
; whereas the run with -amdgpu-use-sdag-ptradd=1 currently computes
; s[2:3] = s[0:1] + 32 via s_add_u32/s_addc_u32 and stores with an immediate
; offset of 16 from that.
define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
63+
; GFX942_PTRADD-LABEL: store_v16i32:
64+
; GFX942_PTRADD: ; %bb.0: ; %entry
65+
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
66+
; GFX942_PTRADD-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
67+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v4, 0
68+
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
69+
; GFX942_PTRADD-NEXT: s_add_u32 s2, s0, 32
70+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s20
71+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s21
72+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s22
73+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s23
74+
; GFX942_PTRADD-NEXT: s_addc_u32 s3, s1, 0
75+
; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
76+
; GFX942_PTRADD-NEXT: s_nop 1
77+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s16
78+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s17
79+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s18
80+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s19
81+
; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
82+
; GFX942_PTRADD-NEXT: s_nop 1
83+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s12
84+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s13
85+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s14
86+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s15
87+
; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
88+
; GFX942_PTRADD-NEXT: s_nop 1
89+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s8
90+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s9
91+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s10
92+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s11
93+
; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
94+
; GFX942_PTRADD-NEXT: s_endpgm
95+
;
96+
; GFX942_LEGACY-LABEL: store_v16i32:
97+
; GFX942_LEGACY: ; %bb.0: ; %entry
98+
; GFX942_LEGACY-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
99+
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
100+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v4, 0
101+
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
102+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s20
103+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s21
104+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s22
105+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s23
106+
; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
107+
; GFX942_LEGACY-NEXT: s_nop 1
108+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s16
109+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s17
110+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s18
111+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s19
112+
; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
113+
; GFX942_LEGACY-NEXT: s_nop 1
114+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s12
115+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s13
116+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s14
117+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s15
118+
; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
119+
; GFX942_LEGACY-NEXT: s_nop 1
120+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s8
121+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s9
122+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s10
123+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s11
124+
; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
125+
; GFX942_LEGACY-NEXT: s_endpgm
126+
entry:
127+
store <16 x i32> %a, ptr addrspace(1) %out
128+
ret void
129+
}
130+
131+
132+
; Tests the (ptradd 0, x) -> x DAG combine.
133+
; Stores %v through a GEP whose base pointer is null, so the address is just
; %offset. In the checks below, the legacy run stores through v[0:1] directly,
; while the run with -amdgpu-use-sdag-ptradd=1 currently still emits a
; v_lshl_add_u64 that adds the offset onto a 0 base.
define void @baseptr_null(i64 %offset, i8 %v) {
134+
; GFX942_PTRADD-LABEL: baseptr_null:
135+
; GFX942_PTRADD: ; %bb.0:
136+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137+
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], 0, 0, v[0:1]
138+
; GFX942_PTRADD-NEXT: flat_store_byte v[0:1], v2
139+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
140+
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
141+
;
142+
; GFX942_LEGACY-LABEL: baseptr_null:
143+
; GFX942_LEGACY: ; %bb.0:
144+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
145+
; GFX942_LEGACY-NEXT: flat_store_byte v[0:1], v2
146+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
147+
; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
148+
%gep = getelementptr i8, ptr null, i64 %offset
149+
store i8 %v, ptr %gep, align 1
150+
ret void
151+
}
152+
153+
; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the
154+
; assertalign DAG combine.
155+
; Performs one volatile i8 load through each of the queue, implicitarg and
; dispatch pointers, then a volatile store of the dispatch id to %ptr. In the
; checks below, the legacy run addresses the second load as s[4:5] with an
; immediate offset of 8, while the run with -amdgpu-use-sdag-ptradd=1 currently
; computes s[8:9] = s[4:5] + 8 via s_add_u32/s_addc_u32 first.
; NOTE(review): this function references attribute group #0, but no
; `attributes #0 = { ... }` definition is visible in this file - confirm whether
; the reference is intentional or a leftover from the file the test was taken
; from.
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
156+
; GFX942_PTRADD-LABEL: llvm_amdgcn_queue_ptr:
157+
; GFX942_PTRADD: ; %bb.0:
158+
; GFX942_PTRADD-NEXT: s_add_u32 s8, s4, 8
159+
; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 0
160+
; GFX942_PTRADD-NEXT: s_addc_u32 s9, s5, 0
161+
; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
162+
; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[8:9] sc0 sc1
163+
; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
164+
; GFX942_PTRADD-NEXT: ; kill: killed $sgpr0_sgpr1
165+
; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
166+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
167+
; GFX942_PTRADD-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
168+
; GFX942_PTRADD-NEXT: ; kill: killed $sgpr8 killed $sgpr9
169+
; GFX942_PTRADD-NEXT: ; kill: killed $sgpr2_sgpr3
170+
; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
171+
; GFX942_PTRADD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
172+
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
173+
; GFX942_PTRADD-NEXT: s_endpgm
174+
;
175+
; GFX942_LEGACY-LABEL: llvm_amdgcn_queue_ptr:
176+
; GFX942_LEGACY: ; %bb.0:
177+
; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 0
178+
; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
179+
; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
180+
; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
181+
; GFX942_LEGACY-NEXT: ; kill: killed $sgpr0_sgpr1
182+
; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
183+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
184+
; GFX942_LEGACY-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
185+
; GFX942_LEGACY-NEXT: ; kill: killed $sgpr2_sgpr3
186+
; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
187+
; GFX942_LEGACY-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
188+
; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
189+
; GFX942_LEGACY-NEXT: s_endpgm
190+
%queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
191+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
192+
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
193+
%dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
194+
%queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
195+
%implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
196+
%dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
197+
store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
198+
ret void
199+
}
200+
201+
; Declarations of the AMDGPU intrinsics called by @llvm_amdgcn_queue_ptr.
declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr()
202+
declare noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
203+
declare i64 @llvm.amdgcn.dispatch.id()
204+
declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
205+
206+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
207+
; GFX942: {{.*}}

0 commit comments

Comments
 (0)