Skip to content

Commit dd7540f

Browse files
committed
AMDGPU: Handle buffer load/store for 64-bit element types
Note that pointers still don't work correctly.
1 parent 90d1bca commit dd7540f

File tree

3 files changed

+189
-0
lines changed

3 files changed

+189
-0
lines changed

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,10 +1429,14 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
14291429
// Each defm below instantiates MUBUF_LoadIntrinsicPat with a load node, a result
// value type and the name of a MUBUF load opcode. The multiclass body is not
// visible in this hunk; presumably it expands to the selection patterns for
// that (node, type) pair -- confirm against its definition.
// The lines marked '+' in the diff gutter map the 64-bit element types:
// i64/f64 use BUFFER_LOAD_DWORDX2 and v2i64/v2f64 use BUFFER_LOAD_DWORDX4,
// the same opcodes already listed here for v2i32/v4i16/v4f16 and v4f32/v4i32.
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
14301430
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
14311431
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
1432+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i64, "BUFFER_LOAD_DWORDX2">;
1433+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f64, "BUFFER_LOAD_DWORDX2">;
14321434
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
14331435
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
14341436
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
14351437
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
1438+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i64, "BUFFER_LOAD_DWORDX4">;
1439+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f64, "BUFFER_LOAD_DWORDX4">;
14361440
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
14371441
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
14381442
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
@@ -1495,6 +1499,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_
14951499
// SIbuffer_store_format instantiations of MUBUF_StoreIntrinsicPat: one defm per
// stored value type, naming the BUFFER_STORE_FORMAT_* opcode used for it.
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
14961500
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
14971501
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
1502+
// NOTE(review): this added line is byte-for-byte identical to the v2i32 /
// BUFFER_STORE_FORMAT_XY defm directly above it, so it appears to be redundant
// (likely an accidental copy). Nothing else in this commit touches format
// stores -- confirm and drop the duplicate in BUFInstructions.td.
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
14981503
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">;
14991504
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">;
15001505
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
@@ -1527,12 +1532,16 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
15271532
// Each defm below instantiates MUBUF_StoreIntrinsicPat with a store node, the
// stored value type and the name of a MUBUF store opcode. The multiclass body
// is not visible in this hunk; presumably it expands to the selection patterns
// for that (node, type) pair -- confirm against its definition.
// The lines marked '+' in the diff gutter map the 64-bit element types:
// i64/f64 use BUFFER_STORE_DWORDX2 and v2i64/v2f64 use BUFFER_STORE_DWORDX4,
// the same opcodes already listed here for the other 64-bit and 128-bit types.
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
15281533
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
15291534
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
1535+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i64, "BUFFER_STORE_DWORDX2">;
1536+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f64, "BUFFER_STORE_DWORDX2">;
15301537
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
15311538
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
15321539
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
15331540
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
15341541
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
15351542
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
1543+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i64, "BUFFER_STORE_DWORDX4">;
1544+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f64, "BUFFER_STORE_DWORDX4">;
15361545
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
15371546
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
15381547

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,110 @@ main_body:
11261126
ret void
11271127
}
11281128

1129+
; Raw pointer buffer load returning a double. The IR adds the constant 60 to
; %voffset before the call; the expected output has no separate add and instead
; carries the constant as the load's immediate (offen offset:60). Expectations
; are listed for the PREGFX10, GFX10 and GFX11 prefixes, with GFX11 expecting
; buffer_load_b64 where the other two expect buffer_load_dwordx2.
define double @buffer_load_f64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
1130+
; PREGFX10-LABEL: buffer_load_f64__voffset_add:
1131+
; PREGFX10: ; %bb.0:
1132+
; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1133+
; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
1134+
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
1135+
; PREGFX10-NEXT: s_setpc_b64 s[30:31]
1136+
;
1137+
; GFX10-LABEL: buffer_load_f64__voffset_add:
1138+
; GFX10: ; %bb.0:
1139+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1140+
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
1141+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1142+
; GFX10-NEXT: s_setpc_b64 s[30:31]
1143+
;
1144+
; GFX11-LABEL: buffer_load_f64__voffset_add:
1145+
; GFX11: ; %bb.0:
1146+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1147+
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60
1148+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1149+
; GFX11-NEXT: s_setpc_b64 s[30:31]
1150+
%voffset.add = add i32 %voffset, 60
1151+
%data = call double @llvm.amdgcn.raw.ptr.buffer.load.f64(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
1152+
ret double %data
1153+
}
1154+
1155+
; Raw pointer buffer load returning <2 x double>. The IR adds the constant 60 to
; %voffset before the call; the expected output has no separate add and instead
; carries the constant as the load's immediate (offen offset:60). Expectations
; are listed for the PREGFX10, GFX10 and GFX11 prefixes, with GFX11 expecting
; buffer_load_b128 where the other two expect buffer_load_dwordx4.
define <2 x double> @buffer_load_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
1156+
; PREGFX10-LABEL: buffer_load_v2f64__voffset_add:
1157+
; PREGFX10: ; %bb.0:
1158+
; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1159+
; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
1160+
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
1161+
; PREGFX10-NEXT: s_setpc_b64 s[30:31]
1162+
;
1163+
; GFX10-LABEL: buffer_load_v2f64__voffset_add:
1164+
; GFX10: ; %bb.0:
1165+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166+
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
1167+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1168+
; GFX10-NEXT: s_setpc_b64 s[30:31]
1169+
;
1170+
; GFX11-LABEL: buffer_load_v2f64__voffset_add:
1171+
; GFX11: ; %bb.0:
1172+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1173+
; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
1174+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1175+
; GFX11-NEXT: s_setpc_b64 s[30:31]
1176+
%voffset.add = add i32 %voffset, 60
1177+
%data = call <2 x double> @llvm.amdgcn.raw.ptr.buffer.load.v2f64(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
1178+
ret <2 x double> %data
1179+
}
1180+
1181+
; Raw pointer buffer load returning an i64. The IR adds the constant 60 to
; %voffset before the call; the expected output has no separate add and instead
; carries the constant as the load's immediate (offen offset:60). Expectations
; are listed for the PREGFX10, GFX10 and GFX11 prefixes, with GFX11 expecting
; buffer_load_b64 where the other two expect buffer_load_dwordx2.
define i64 @buffer_load_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
1182+
; PREGFX10-LABEL: buffer_load_i64__voffset_add:
1183+
; PREGFX10: ; %bb.0:
1184+
; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1185+
; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
1186+
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
1187+
; PREGFX10-NEXT: s_setpc_b64 s[30:31]
1188+
;
1189+
; GFX10-LABEL: buffer_load_i64__voffset_add:
1190+
; GFX10: ; %bb.0:
1191+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1192+
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
1193+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1194+
; GFX10-NEXT: s_setpc_b64 s[30:31]
1195+
;
1196+
; GFX11-LABEL: buffer_load_i64__voffset_add:
1197+
; GFX11: ; %bb.0:
1198+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1199+
; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60
1200+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1201+
; GFX11-NEXT: s_setpc_b64 s[30:31]
1202+
%voffset.add = add i32 %voffset, 60
1203+
%data = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
1204+
ret i64 %data
1205+
}
1206+
1207+
; Raw pointer buffer load returning <2 x i64>. The IR adds the constant 60 to
; %voffset before the call; the expected output has no separate add and instead
; carries the constant as the load's immediate (offen offset:60). Expectations
; are listed for the PREGFX10, GFX10 and GFX11 prefixes, with GFX11 expecting
; buffer_load_b128 where the other two expect buffer_load_dwordx4.
define <2 x i64> @buffer_load_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
1208+
; PREGFX10-LABEL: buffer_load_v2i64__voffset_add:
1209+
; PREGFX10: ; %bb.0:
1210+
; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1211+
; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
1212+
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
1213+
; PREGFX10-NEXT: s_setpc_b64 s[30:31]
1214+
;
1215+
; GFX10-LABEL: buffer_load_v2i64__voffset_add:
1216+
; GFX10: ; %bb.0:
1217+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1218+
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
1219+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1220+
; GFX10-NEXT: s_setpc_b64 s[30:31]
1221+
;
1222+
; GFX11-LABEL: buffer_load_v2i64__voffset_add:
1223+
; GFX11: ; %bb.0:
1224+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1225+
; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
1226+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1227+
; GFX11-NEXT: s_setpc_b64 s[30:31]
1228+
%voffset.add = add i32 %voffset, 60
1229+
%data = call <2 x i64> @llvm.amdgcn.raw.ptr.buffer.load.v2i64(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
1230+
ret <2 x i64> %data
1231+
}
1232+
11291233
declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #0
11301234
declare <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32) #0
11311235
declare <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32) #0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,82 @@ define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_swizzled_not_merged(ptr ad
307307
ret void
308308
}
309309

310+
; Raw pointer buffer store of a double. The IR adds the constant 60 to %voffset
; before the call; the expected output has no separate add and instead carries
; the constant as the store's immediate (offen offset:60) on a
; buffer_store_dwordx2. The VERDE and CHECK expectations differ only in the wait
; that follows the store: VERDE also waits on expcnt.
define void @buffer_store_f64__voffset_add(ptr addrspace(8) inreg %rsrc, double %data, i32 %voffset) #0 {
311+
; VERDE-LABEL: buffer_store_f64__voffset_add:
312+
; VERDE: ; %bb.0:
313+
; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314+
; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60
315+
; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0)
316+
; VERDE-NEXT: s_setpc_b64 s[30:31]
317+
;
318+
; CHECK-LABEL: buffer_store_f64__voffset_add:
319+
; CHECK: ; %bb.0:
320+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
321+
; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60
322+
; CHECK-NEXT: s_waitcnt vmcnt(0)
323+
; CHECK-NEXT: s_setpc_b64 s[30:31]
324+
%voffset.add = add i32 %voffset, 60
325+
call void @llvm.amdgcn.raw.ptr.buffer.store.f64(double %data, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
326+
ret void
327+
}
328+
329+
; Raw pointer buffer store of <2 x double>. The IR adds the constant 60 to
; %voffset before the call; the expected output has no separate add and instead
; carries the constant as the store's immediate (offen offset:60) on a
; buffer_store_dwordx4. The VERDE and CHECK expectations differ only in the wait
; that follows the store: VERDE also waits on expcnt.
define void @buffer_store_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x double> %data, i32 %voffset) #0 {
330+
; VERDE-LABEL: buffer_store_v2f64__voffset_add:
331+
; VERDE: ; %bb.0:
332+
; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
333+
; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60
334+
; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0)
335+
; VERDE-NEXT: s_setpc_b64 s[30:31]
336+
;
337+
; CHECK-LABEL: buffer_store_v2f64__voffset_add:
338+
; CHECK: ; %bb.0:
339+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
340+
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60
341+
; CHECK-NEXT: s_waitcnt vmcnt(0)
342+
; CHECK-NEXT: s_setpc_b64 s[30:31]
343+
%voffset.add = add i32 %voffset, 60
344+
call void @llvm.amdgcn.raw.ptr.buffer.store.v2f64(<2 x double> %data, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
345+
ret void
346+
}
347+
348+
; Raw pointer buffer store of an i64. The IR adds the constant 60 to %voffset
; before the call; the expected output has no separate add and instead carries
; the constant as the store's immediate (offen offset:60) on a
; buffer_store_dwordx2. The VERDE and CHECK expectations differ only in the wait
; that follows the store: VERDE also waits on expcnt.
define void @buffer_store_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i64 %data, i32 %voffset) #0 {
349+
; VERDE-LABEL: buffer_store_i64__voffset_add:
350+
; VERDE: ; %bb.0:
351+
; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352+
; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60
353+
; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0)
354+
; VERDE-NEXT: s_setpc_b64 s[30:31]
355+
;
356+
; CHECK-LABEL: buffer_store_i64__voffset_add:
357+
; CHECK: ; %bb.0:
358+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
359+
; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60
360+
; CHECK-NEXT: s_waitcnt vmcnt(0)
361+
; CHECK-NEXT: s_setpc_b64 s[30:31]
362+
%voffset.add = add i32 %voffset, 60
363+
call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 %data, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
364+
ret void
365+
}
366+
367+
; Raw pointer buffer store of <2 x i64>. The IR adds the constant 60 to %voffset
; before the call; the expected output has no separate add and instead carries
; the constant as the store's immediate (offen offset:60) on a
; buffer_store_dwordx4. The VERDE and CHECK expectations differ only in the wait
; that follows the store: VERDE also waits on expcnt.
define void @buffer_store_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x i64> %data, i32 %voffset) #0 {
368+
; VERDE-LABEL: buffer_store_v2i64__voffset_add:
369+
; VERDE: ; %bb.0:
370+
; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371+
; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60
372+
; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0)
373+
; VERDE-NEXT: s_setpc_b64 s[30:31]
374+
;
375+
; CHECK-LABEL: buffer_store_v2i64__voffset_add:
376+
; CHECK: ; %bb.0:
377+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378+
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60
379+
; CHECK-NEXT: s_waitcnt vmcnt(0)
380+
; CHECK-NEXT: s_setpc_b64 s[30:31]
381+
%voffset.add = add i32 %voffset, 60
382+
call void @llvm.amdgcn.raw.ptr.buffer.store.v2i64(<2 x i64> %data, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
383+
ret void
384+
}
385+
310386
declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32) #0
311387
declare void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32) #0
312388
declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) #0

0 commit comments

Comments
 (0)