@@ -16,7 +16,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
16
16
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
17
17
; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40
18
18
; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80
19
- ; GCN-NEXT: v_mov_b32_e32 v64, 0
20
19
; GCN-NEXT: s_waitcnt lgkmcnt(0)
21
20
; GCN-NEXT: v_mov_b32_e32 v0, s36
22
21
; GCN-NEXT: v_mov_b32_e32 v1, s37
@@ -158,10 +157,23 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
158
157
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:260
159
158
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:264
160
159
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:268
161
- ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:272
162
- ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:276
163
- ; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:280
164
- ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:284
160
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
161
+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:512 ; 4-byte Folded Spill
162
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
163
+ ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:516 ; 4-byte Folded Spill
164
+ ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:520 ; 4-byte Folded Spill
165
+ ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:524 ; 4-byte Folded Spill
166
+ ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:272
167
+ ; GCN-NEXT: s_nop 0
168
+ ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:276
169
+ ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:280
170
+ ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:284
171
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
172
+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:528 ; 4-byte Folded Spill
173
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
174
+ ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:532 ; 4-byte Folded Spill
175
+ ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:536 ; 4-byte Folded Spill
176
+ ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:540 ; 4-byte Folded Spill
165
177
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:288
166
178
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:292
167
179
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:296
@@ -218,43 +230,45 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
218
230
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:500
219
231
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504
220
232
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508
221
- ; GCN-NEXT: s_waitcnt vmcnt(60)
222
- ; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21]
223
- ; GCN-NEXT: s_waitcnt vmcnt(57)
224
- ; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16
225
- ; GCN-NEXT: s_waitcnt vmcnt(54)
226
- ; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32
227
- ; GCN-NEXT: s_waitcnt vmcnt(51)
228
- ; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48
229
- ; GCN-NEXT: s_waitcnt vmcnt(48)
230
- ; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64
231
- ; GCN-NEXT: s_waitcnt vmcnt(45)
232
- ; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80
233
- ; GCN-NEXT: s_waitcnt vmcnt(42)
234
- ; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96
235
- ; GCN-NEXT: s_waitcnt vmcnt(39)
236
- ; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112
237
- ; GCN-NEXT: s_waitcnt vmcnt(36)
238
- ; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128
239
- ; GCN-NEXT: s_waitcnt vmcnt(33)
240
- ; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144
241
- ; GCN-NEXT: s_waitcnt vmcnt(30)
242
- ; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160
243
- ; GCN-NEXT: s_waitcnt vmcnt(27)
244
- ; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176
245
- ; GCN-NEXT: s_waitcnt vmcnt(24)
246
- ; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192
247
- ; GCN-NEXT: s_waitcnt vmcnt(21)
248
- ; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208
249
- ; GCN-NEXT: s_waitcnt vmcnt(18)
250
- ; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224
251
- ; GCN-NEXT: s_waitcnt vmcnt(15)
252
- ; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[20:21] offset:240
233
+ ; GCN-NEXT: s_nop 0
234
+ ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:512 ; 4-byte Folded Reload
235
+ ; GCN-NEXT: s_nop 0
236
+ ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:516 ; 4-byte Folded Reload
237
+ ; GCN-NEXT: s_nop 0
238
+ ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:520 ; 4-byte Folded Reload
239
+ ; GCN-NEXT: s_nop 0
240
+ ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:524 ; 4-byte Folded Reload
241
+ ; GCN-NEXT: v_mov_b32_e32 v0, 0
242
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
243
+ ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21]
244
+ ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:528 ; 4-byte Folded Reload
245
+ ; GCN-NEXT: s_nop 0
246
+ ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:532 ; 4-byte Folded Reload
247
+ ; GCN-NEXT: s_nop 0
248
+ ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:536 ; 4-byte Folded Reload
249
+ ; GCN-NEXT: s_nop 0
250
+ ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:540 ; 4-byte Folded Reload
251
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
252
+ ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21] offset:16
253
+ ; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[20:21] offset:32
254
+ ; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[20:21] offset:48
255
+ ; GCN-NEXT: global_store_dwordx4 v0, v[16:19], s[20:21] offset:64
256
+ ; GCN-NEXT: global_store_dwordx4 v0, v[20:23], s[20:21] offset:80
257
+ ; GCN-NEXT: global_store_dwordx4 v0, v[24:27], s[20:21] offset:96
258
+ ; GCN-NEXT: global_store_dwordx4 v0, v[28:31], s[20:21] offset:112
259
+ ; GCN-NEXT: global_store_dwordx4 v0, v[32:35], s[20:21] offset:128
260
+ ; GCN-NEXT: global_store_dwordx4 v0, v[36:39], s[20:21] offset:144
261
+ ; GCN-NEXT: global_store_dwordx4 v0, v[40:43], s[20:21] offset:160
262
+ ; GCN-NEXT: global_store_dwordx4 v0, v[44:47], s[20:21] offset:176
263
+ ; GCN-NEXT: global_store_dwordx4 v0, v[48:51], s[20:21] offset:192
264
+ ; GCN-NEXT: global_store_dwordx4 v0, v[52:55], s[20:21] offset:208
265
+ ; GCN-NEXT: global_store_dwordx4 v0, v[56:59], s[20:21] offset:224
266
+ ; GCN-NEXT: global_store_dwordx4 v0, v[60:63], s[20:21] offset:240
253
267
; GCN-NEXT: s_endpgm
254
268
%vec = load <64 x i32 >, <64 x i32 > addrspace (1 )* %ptr
255
269
%insert = insertelement <64 x i32 > %vec , i32 %val , i32 %idx
256
270
store <64 x i32 > %insert , <64 x i32 > addrspace (1 )* %out.ptr
257
271
ret void
258
272
}
259
273
260
- attributes #0 = { "amdgpu-waves-per-eu" ="1,10" }
274
+ ; Attribute group #0: limits the flat workgroup size to the range 1..256 and
+ ; requests a waves-per-execution-unit range of 1..10 (both are "min,max" pairs).
+ attributes #0 = { "amdgpu-flat-workgroup-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
0 commit comments