Skip to content

Commit 40edb0a

Browse files
authored
[AMDGPU] llvm.amdgcn.raw.buffer.load.format intrinsic supports v4i32 as return type. (#116067)
1 parent 0341da5 commit 40edb0a

File tree

3 files changed, with 189 additions and 3 deletions.

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1148,8 +1148,8 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
11481148
// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved
11491149
// through to final assembly selection and is used to signal that the buffer
11501150
// operation is volatile.
1151-
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
1152-
[data_ty],
1151+
class AMDGPURawBufferLoad : DefaultAttrsIntrinsic <
1152+
[llvm_any_ty],
11531153
[llvm_v4i32_ty, // rsrc(SGPR)
11541154
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
11551155
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
@@ -1162,7 +1162,7 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsi
11621162
// all: volatile op (bit 31, stripped at lowering)
11631163
[IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
11641164
AMDGPURsrcIntrinsic<0>;
1165-
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
1165+
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
11661166
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
11671167

11681168
class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll

Lines changed: 115 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -169,6 +169,62 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
169169
ret <4 x float> %val
170170
}
171171

172+
define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
173+
; GFX8-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32
174+
; GFX8: bb.1 (%ir-block.0):
175+
; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
176+
; GFX8-NEXT: {{ $}}
177+
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
178+
; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
179+
; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
180+
; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
181+
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
182+
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
183+
; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
184+
; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
185+
; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
186+
; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
187+
; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
188+
; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3
189+
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
190+
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
191+
; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
192+
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
193+
; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
194+
; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
195+
; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
196+
; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
197+
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
198+
;
199+
; GFX12-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_v4i32
200+
; GFX12: bb.1 (%ir-block.0):
201+
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
202+
; GFX12-NEXT: {{ $}}
203+
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
204+
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
205+
; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
206+
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
207+
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
208+
; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
209+
; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
210+
; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
211+
; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub0
212+
; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1
213+
; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2
214+
; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3
215+
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
216+
; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
217+
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
218+
; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
219+
; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
220+
; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
221+
; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
222+
; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
223+
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
224+
%val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
225+
ret <4 x i32> %val
226+
}
227+
172228
; Waterfall for rsrc and soffset, copy for voffset
173229
define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
174230
; GFX8-LABEL: name: raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset
@@ -325,9 +381,68 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
325381
ret <4 x float> %val
326382
}
327383

384+
define amdgpu_ps <4 x i32> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32(<4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
385+
; GFX8-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32
386+
; GFX8: bb.1 (%ir-block.0):
387+
; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
388+
; GFX8-NEXT: {{ $}}
389+
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
390+
; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
391+
; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
392+
; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
393+
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
394+
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
395+
; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
396+
; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
397+
; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
398+
; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
399+
; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2
400+
; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub3
401+
; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
402+
; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
403+
; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
404+
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
405+
; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
406+
; GFX8-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
407+
; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
408+
; GFX8-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
409+
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
410+
;
411+
; GFX12-LABEL: name: raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add_4095_v4i32
412+
; GFX12: bb.1 (%ir-block.0):
413+
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
414+
; GFX12-NEXT: {{ $}}
415+
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
416+
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
417+
; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
418+
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
419+
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
420+
; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
421+
; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
422+
; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
423+
; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub0
424+
; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub1
425+
; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub2
426+
; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_OFFEN]].sub3
427+
; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
428+
; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
429+
; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
430+
; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
431+
; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
432+
; GFX12-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
433+
; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
434+
; GFX12-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
435+
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
436+
%voffset = add i32 %voffset.base, 4095
437+
%val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
438+
ret <4 x i32> %val
439+
}
440+
441+
328442
declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32 immarg) #0
329443
declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32 immarg) #0
330444
declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32 immarg) #0
331445
declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32 immarg) #0
446+
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32 immarg) #0
332447

333448
attributes #0 = { nounwind readonly }

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll

Lines changed: 71 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,25 @@ main_body:
1717
ret {<4 x float>, <4 x float>, <4 x float>} %r2
1818
}
1919

20+
;CHECK-LABEL: {{^}}buffer_load_v4i32:
21+
;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
22+
;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
23+
;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
24+
;CHECK: s_waitcnt
25+
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_v4i32(<4 x i32> inreg) {
26+
main_body:
27+
%data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0)
28+
%data_glc = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 1)
29+
%data_slc = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 0, i32 0, i32 2)
30+
%fdata = bitcast <4 x i32> %data to <4 x float>
31+
%fdata_glc = bitcast <4 x i32> %data_glc to <4 x float>
32+
%fdata_slc = bitcast <4 x i32> %data_slc to <4 x float>
33+
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %fdata, 0
34+
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %fdata_glc, 1
35+
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %fdata_slc, 2
36+
ret {<4 x float>, <4 x float>, <4 x float>} %r2
37+
}
38+
2039
;CHECK-LABEL: {{^}}buffer_load_immoffs:
2140
;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
2241
;CHECK: s_waitcnt
@@ -26,6 +45,16 @@ main_body:
2645
ret <4 x float> %data
2746
}
2847

48+
;CHECK-LABEL: {{^}}buffer_load_immoffs_v4i32:
49+
;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
50+
;CHECK: s_waitcnt
51+
define amdgpu_ps <4 x float> @buffer_load_immoffs_v4i32(<4 x i32> inreg) {
52+
main_body:
53+
%data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 42, i32 0, i32 0)
54+
%fdata = bitcast <4 x i32> %data to <4 x float>
55+
ret <4 x float> %fdata
56+
}
57+
2958
;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
3059
;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
3160
;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
@@ -43,6 +72,26 @@ main_body:
4372
ret <4 x float> %data
4473
}
4574

75+
;CHECK-LABEL: {{^}}buffer_load_immoffs_large_v4i32:
76+
;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
77+
;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
78+
;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
79+
;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
80+
;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
81+
;CHECK: s_waitcnt
82+
define amdgpu_ps <4 x float> @buffer_load_immoffs_large_v4i32(<4 x i32> inreg) {
83+
main_body:
84+
%d.0 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4092, i32 60, i32 0)
85+
%d.1 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4092, i32 32764, i32 0)
86+
%d.2 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 4, i32 36860, i32 0)
87+
%fd.0 = bitcast <4 x i32> %d.0 to <4 x float>
88+
%fd.1 = bitcast <4 x i32> %d.1 to <4 x float>
89+
%fd.2 = bitcast <4 x i32> %d.2 to <4 x float>
90+
%d.3 = fadd <4 x float> %fd.0, %fd.1
91+
%data = fadd <4 x float> %fd.2, %d.3
92+
ret <4 x float> %data
93+
}
94+
4695
;CHECK-LABEL: {{^}}buffer_load_ofs:
4796
;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
4897
;CHECK: s_waitcnt
@@ -52,6 +101,16 @@ main_body:
52101
ret <4 x float> %data
53102
}
54103

104+
;CHECK-LABEL: {{^}}buffer_load_ofs_v4i32:
105+
;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
106+
;CHECK: s_waitcnt
107+
define amdgpu_ps <4 x float> @buffer_load_ofs_v4i32(<4 x i32> inreg, i32) {
108+
main_body:
109+
%data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 %1, i32 0, i32 0)
110+
%fdata = bitcast <4 x i32> %data to <4 x float>
111+
ret <4 x float> %fdata
112+
}
113+
55114
;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
56115
;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
57116
;CHECK: s_waitcnt
@@ -62,6 +121,17 @@ main_body:
62121
ret <4 x float> %data
63122
}
64123

124+
;CHECK-LABEL: {{^}}buffer_load_ofs_imm_v4i32:
125+
;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
126+
;CHECK: s_waitcnt
127+
define amdgpu_ps <4 x float> @buffer_load_ofs_imm_v4i32(<4 x i32> inreg, i32) {
128+
main_body:
129+
%ofs = add i32 %1, 60
130+
%data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
131+
%fdata = bitcast <4 x i32> %data to <4 x float>
132+
ret <4 x float> %fdata
133+
}
134+
65135
;CHECK-LABEL: {{^}}buffer_load_x:
66136
;CHECK: buffer_load_format_x v0, off, s[0:3], 0
67137
;CHECK: s_waitcnt
@@ -83,5 +153,6 @@ main_body:
83153
declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #0
84154
declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #0
85155
declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
156+
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32) #0
86157

87158
attributes #0 = { nounwind readonly }

0 commit comments

Comments
 (0)