@@ -145,49 +145,29 @@ entry:
145
145
146
146
; Test skipping the lower-32-bit addition if it is unnecessary.
147
147
define ptr @huge_offset_low_32_unused (ptr %p ) {
148
- ; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
149
- ; GFX942_PTRADD: ; %bb.0:
150
- ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151
- ; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0
152
- ; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1
153
- ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
154
- ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
155
- ;
156
- ; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
157
- ; GFX942_LEGACY: ; %bb.0:
158
- ; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159
- ; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1
160
- ; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
148
+ ; GFX942-LABEL: huge_offset_low_32_unused:
149
+ ; GFX942: ; %bb.0:
150
+ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151
+ ; GFX942-NEXT: v_add_u32_e32 v1, 1, v1
152
+ ; GFX942-NEXT: s_setpc_b64 s[30:31]
; The byte offset below is u0x100000000 (2^32), so its low 32 bits are all
; zero. The expected GFX942 output therefore contains a single 32-bit add of 1
; to v1 and no 64-bit add over v[0:1].
161
153
%gep = getelementptr inbounds i8 , ptr %p , i64 u0x100000000
162
154
ret ptr %gep
163
155
}
164
156
165
157
; Reassociate address computation if it leads to more scalar operations.
166
158
define amdgpu_kernel void @reassoc_scalar_r (ptr addrspace (1 ) %out , ptr addrspace (1 ) %p , i64 %soffset ) {
167
- ; GFX942_PTRADD-LABEL: reassoc_scalar_r:
168
- ; GFX942_PTRADD: ; %bb.0: ; %entry
169
- ; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
170
- ; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
171
- ; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
172
- ; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
173
- ; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
174
- ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
175
- ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
176
- ; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
177
- ; GFX942_PTRADD-NEXT: s_endpgm
178
- ;
179
- ; GFX942_LEGACY-LABEL: reassoc_scalar_r:
180
- ; GFX942_LEGACY: ; %bb.0: ; %entry
181
- ; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
182
- ; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
183
- ; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
184
- ; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
185
- ; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
186
- ; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
187
- ; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
188
- ; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
189
- ; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
190
- ; GFX942_LEGACY-NEXT: s_endpgm
159
+ ; GFX942-LABEL: reassoc_scalar_r:
160
+ ; GFX942: ; %bb.0: ; %entry
161
+ ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
162
+ ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
163
+ ; GFX942-NEXT: v_mov_b32_e32 v1, 0
164
+ ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
165
+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
166
+ ; GFX942-NEXT: s_add_u32 s2, s2, s6
167
+ ; GFX942-NEXT: s_addc_u32 s3, s3, s7
168
+ ; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
169
+ ; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
170
+ ; GFX942-NEXT: s_endpgm
191
171
entry:
192
172
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x ()
193
173
%voffset = zext i32 %voffset32 to i64
@@ -198,30 +178,18 @@ entry:
198
178
}
199
179
200
180
define amdgpu_kernel void @reassoc_scalar_l (ptr addrspace (1 ) %out , ptr addrspace (1 ) %p , i64 %soffset ) {
201
- ; GFX942_PTRADD-LABEL: reassoc_scalar_l:
202
- ; GFX942_PTRADD: ; %bb.0: ; %entry
203
- ; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
204
- ; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
205
- ; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
206
- ; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
207
- ; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
208
- ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
209
- ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
210
- ; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
211
- ; GFX942_PTRADD-NEXT: s_endpgm
212
- ;
213
- ; GFX942_LEGACY-LABEL: reassoc_scalar_l:
214
- ; GFX942_LEGACY: ; %bb.0: ; %entry
215
- ; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
216
- ; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
217
- ; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
218
- ; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
219
- ; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
220
- ; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
221
- ; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
222
- ; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
223
- ; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
224
- ; GFX942_LEGACY-NEXT: s_endpgm
181
+ ; GFX942-LABEL: reassoc_scalar_l:
182
+ ; GFX942: ; %bb.0: ; %entry
183
+ ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
184
+ ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
185
+ ; GFX942-NEXT: v_mov_b32_e32 v1, 0
186
+ ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
187
+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
188
+ ; GFX942-NEXT: s_add_u32 s2, s2, s6
189
+ ; GFX942-NEXT: s_addc_u32 s3, s3, s7
190
+ ; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
191
+ ; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
192
+ ; GFX942-NEXT: s_endpgm
225
193
entry:
226
194
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x ()
227
195
%voffset = zext i32 %voffset32 to i64
@@ -233,24 +201,14 @@ entry:
233
201
234
202
; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
235
203
define ptr addrspace (1 ) @shl_neg_offset (ptr addrspace (1 ) %p , i64 %noffset , i64 %shift ) {
236
- ; GFX942_PTRADD-LABEL: shl_neg_offset:
237
- ; GFX942_PTRADD: ; %bb.0:
238
- ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239
- ; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2
240
- ; GFX942_PTRADD-NEXT: s_nop 1
241
- ; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
242
- ; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
243
- ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
244
- ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
245
- ;
246
- ; GFX942_LEGACY-LABEL: shl_neg_offset:
247
- ; GFX942_LEGACY: ; %bb.0:
248
- ; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249
- ; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
250
- ; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
251
- ; GFX942_LEGACY-NEXT: s_nop 1
252
- ; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
253
- ; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
204
+ ; GFX942-LABEL: shl_neg_offset:
205
+ ; GFX942: ; %bb.0:
206
+ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207
+ ; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
208
+ ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
209
+ ; GFX942-NEXT: s_nop 1
210
+ ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
211
+ ; GFX942-NEXT: s_setpc_b64 s[30:31]
254
212
%offset = sub i64 0 , %noffset
255
213
%x = shl i64 %offset , %shift
256
214
%gep = getelementptr inbounds i8 , ptr addrspace (1 ) %p , i64 %x
@@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
268
226
; GFX942_PTRADD: ; %bb.0:
269
227
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
270
228
; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
271
- ; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4
272
- ; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12
229
+ ; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
230
+ ; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
273
231
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
274
- ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10
275
232
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
276
233
;
277
234
; GFX942_LEGACY-LABEL: complextype_global_gep:
@@ -291,27 +248,15 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
291
248
292
249
; Tests the tryFoldToMad64_32 PTRADD combine.
293
250
define amdgpu_kernel void @fold_mad64 (ptr addrspace (1 ) %p ) {
294
- ; GFX942_PTRADD-LABEL: fold_mad64:
295
- ; GFX942_PTRADD: ; %bb.0:
296
- ; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
297
- ; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
298
- ; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0
299
- ; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0
300
- ; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0
301
- ; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
302
- ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
303
- ; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off
304
- ; GFX942_PTRADD-NEXT: s_endpgm
305
- ;
306
- ; GFX942_LEGACY-LABEL: fold_mad64:
307
- ; GFX942_LEGACY: ; %bb.0:
308
- ; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
309
- ; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
310
- ; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0
311
- ; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
312
- ; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
313
- ; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off
314
- ; GFX942_LEGACY-NEXT: s_endpgm
251
+ ; GFX942-LABEL: fold_mad64:
252
+ ; GFX942: ; %bb.0:
253
+ ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
254
+ ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
255
+ ; GFX942-NEXT: v_mov_b32_e32 v2, 1.0
256
+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
257
+ ; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
258
+ ; GFX942-NEXT: global_store_dword v[0:1], v2, off
259
+ ; GFX942-NEXT: s_endpgm
315
260
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x ()
316
261
%voffset = zext i32 %voffset32 to i64
317
262
%p1 = getelementptr inbounds %S , ptr addrspace (1 ) %p , i64 %voffset , i32 0
0 commit comments