@@ -169,6 +169,62 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
169
169
ret <4 x float > %val
170
170
}
171
171
172
; Raw buffer format load through the v4i32 overload of the intrinsic: %rsrc and
; %soffset are passed inreg, %voffset is a plain i32 argument, and the aux
; immediate is 0.
; The checks below expect a single BUFFER_LOAD_FORMAT_XYZW_OFFEN on GFX8 and a
; single BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN on GFX12, followed by one
; V_READFIRSTLANE_B32 per result element to return the value in $sgpr0-$sgpr3.
; NOTE(review): the function name keeps the "v4f32" stem although the loaded
; type is <4 x i32>; presumably inherited from the float test it mirrors.
+ define amdgpu_ps <4 x i32 > @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32 (<4 x i32 > inreg %rsrc , i32 %voffset , i32 inreg %soffset ) {
173
+ ; GFX8-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32
174
+ ; GFX8: bb.1 (%ir-block.0):
175
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
176
+ ; GFX8-NEXT: {{ $}}
177
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
178
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
179
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
180
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
181
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
182
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
183
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
184
+ ; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
185
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
186
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
187
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
188
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3
189
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
190
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
191
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
192
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
193
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
194
+ ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
195
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
196
+ ; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
197
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
198
+ ;
199
+ ; GFX12-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32
200
+ ; GFX12: bb.1 (%ir-block.0):
201
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
202
+ ; GFX12-NEXT: {{ $}}
203
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
204
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
205
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
206
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
207
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
208
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
209
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
210
+ ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
211
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub0
212
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1
213
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2
214
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3
215
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
216
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
217
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
218
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
219
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
220
+ ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
221
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
222
+ ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
223
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
224
+ %val = call <4 x i32 > @llvm.amdgcn.raw.buffer.load.format.v4i32 (<4 x i32 > %rsrc , i32 %voffset , i32 %soffset , i32 0 )
225
+ ret <4 x i32 > %val
226
+ }
227
+
172
228
; Waterfall for rsrc and soffset, copy for voffset
173
229
define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset (<4 x i32 > %rsrc , i32 inreg %voffset , i32 %soffset ) {
174
230
; GFX8-LABEL: name: raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset
@@ -325,9 +381,68 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
325
381
ret <4 x float > %val
326
382
}
327
383
384
; Raw buffer format load through the v4i32 overload of the intrinsic, where the
; voffset operand is computed as %voffset.base + 4095. %rsrc and %soffset are
; passed inreg; %voffset.base is a plain i32 argument.
; The checks below expect no separate add instruction: the constant 4095 shows
; up as the immediate offset operand of BUFFER_LOAD_FORMAT_XYZW_OFFEN (GFX8) /
; BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN (GFX12), and [[COPY4]] (the $vgpr0
; argument) is used directly as the register offset. The result is returned in
; $sgpr0-$sgpr3 via one V_READFIRSTLANE_B32 per element.
; NOTE(review): the function name keeps the "v4f32" stem although the loaded
; type is <4 x i32>; presumably inherited from the float test it mirrors.
+ define amdgpu_ps <4 x i32 > @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32 (<4 x i32 > inreg %rsrc , i32 %voffset.base , i32 inreg %soffset ) {
385
+ ; GFX8-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32
386
+ ; GFX8: bb.1 (%ir-block.0):
387
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
388
+ ; GFX8-NEXT: {{ $}}
389
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
390
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
391
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
392
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
393
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
394
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
395
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
396
+ ; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
397
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
398
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
399
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
400
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3
401
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
402
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
403
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
404
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
405
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
406
+ ; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
407
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
408
+ ; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
409
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
410
+ ;
411
+ ; GFX12-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32
412
+ ; GFX12: bb.1 (%ir-block.0):
413
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
414
+ ; GFX12-NEXT: {{ $}}
415
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
416
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
417
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
418
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
419
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
420
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
421
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
422
+ ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
423
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub0
424
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1
425
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2
426
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3
427
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
428
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
429
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
430
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
431
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
432
+ ; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
433
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
434
+ ; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
435
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
436
+ %voffset = add i32 %voffset.base , 4095
437
+ %val = call <4 x i32 > @llvm.amdgcn.raw.buffer.load.format.v4i32 (<4 x i32 > %rsrc , i32 %voffset , i32 %soffset , i32 0 )
438
+ ret <4 x i32 > %val
439
+ }
440
+
441
+
328
442
; Declarations of the llvm.amdgcn.raw.buffer.load.format intrinsic overloads
; (f32, v2f32, v3f32, v4f32 and v4i32). Each takes a <4 x i32> operand, two i32
; operands, and a trailing i32 that must be an immediate (immarg); all share
; attribute group #0.
declare float @llvm.amdgcn.raw.buffer.load.format.f32 (<4 x i32 >, i32 , i32 , i32 immarg) #0
329
443
declare <2 x float > @llvm.amdgcn.raw.buffer.load.format.v2f32 (<4 x i32 >, i32 , i32 , i32 immarg) #0
330
444
declare <3 x float > @llvm.amdgcn.raw.buffer.load.format.v3f32 (<4 x i32 >, i32 , i32 , i32 immarg) #0
331
445
declare <4 x float > @llvm.amdgcn.raw.buffer.load.format.v4f32 (<4 x i32 >, i32 , i32 , i32 immarg) #0
446
+ declare <4 x i32 > @llvm.amdgcn.raw.buffer.load.format.v4i32 (<4 x i32 >, i32 , i32 , i32 immarg) #0
332
447
333
448
attributes #0 = { nounwind readonly }
0 commit comments