Skip to content

AMDGPU: Handle gfx950 96/128-bit buffer_load_lds #116681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
[],
[llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>, // LDS base offset
llvm_i32_ty, // Data byte size: 1/2/4
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling)
Expand All @@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
[],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
LLVMQualPointerType<3>, // LDS base offset
llvm_i32_ty, // Data byte size: 1/2/4
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling)
Expand All @@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
[],
[llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>, // LDS base offset
llvm_i32_ty, // Data byte size: 1/2/4
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
Expand All @@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
[],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
LLVMQualPointerType<3>, // LDS base offset
llvm_i32_ty, // Data byte size: 1/2/4
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3240,6 +3240,24 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
: AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
break;
case 12:
if (!Subtarget->hasLDSLoadB96_B128())
return false;

Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
break;
case 16:
if (!Subtarget->hasLDSLoadB96_B128())
return false;

Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
break;
}

MachineBasicBlock *MBB = MI.getParent();
Expand Down
24 changes: 16 additions & 8 deletions llvm/lib/Target/AMDGPU/BUFInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -573,9 +573,17 @@ multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
}
}

multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> {
multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, Predicate LDSPred = TruePredicate> {
defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>;
defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;

if !ne(LDSPred, TruePredicate) then {
let SubtargetPredicate = LDSPred in {
defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
}
} else {
defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
}

}

multiclass MUBUF_Pseudo_Loads_LDSOpc<string opName,
Expand Down Expand Up @@ -956,11 +964,11 @@ defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx2", v2i32
>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx3", v3i32
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads_Lds <
"buffer_load_dwordx3", v3i32, /*LDSPred=*/HasGFX950Insts
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx4", v4i32
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads_Lds <
"buffer_load_dwordx4", v4i32, /*LDSPred=*/HasGFX950Insts
>;

defm BUFFER_LOAD_LDS_B32 : MUBUF_Pseudo_Loads_LDSOpc <
Expand Down Expand Up @@ -3231,8 +3239,8 @@ defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>;
defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>;
defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>;
defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>;
defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;
Expand Down
16 changes: 16 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9825,6 +9825,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
: AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
break;
case 12:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
break;
case 16:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
break;
}

SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s

; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s

; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.load.lds

; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.load.lds),


declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)

;---------------------------------------------------------------------
Expand Down
176 changes: 176 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s

; FIXME: Not a great error
; ERR-SDAG: LLVM ERROR: Do not know how to expand this operator's operand!
; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.ptr.buffer.load.lds),

declare void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)

;---------------------------------------------------------------------
; dwordx3
;---------------------------------------------------------------------

; 96-bit buffer-to-LDS load with no voffset/soffset. Three calls cover
; immediate offsets 0/4/8 and aux values 0/1/2, followed by a read-back of the
; LDS location. The size operand is 12 so that this test exercises the dwordx3
; opcode its name refers to (it previously passed size 4 and only checked
; buffer_load_dword).
; NOTE(review): the assertions below were adjusted by hand on the assumption
; that only the opcode changes; re-run utils/update_llc_test_checks.py to
; confirm.
define amdgpu_ps float @buffer_load_lds_dwordx3(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
; GFX950-LABEL: buffer_load_lds_dwordx3:
; GFX950: ; %bb.0: ; %main_body
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx3 off, s[0:3], 0 lds
; GFX950-NEXT: buffer_load_dwordx3 off, s[0:3], 0 offset:4 sc0 lds
; GFX950-NEXT: buffer_load_dwordx3 off, s[0:3], 0 offset:8 nt lds
; GFX950-NEXT: v_mov_b32_e32 v0, s4
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ds_read_b32 v0, v0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; return to shader part epilog
main_body:
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 0, i32 4, i32 1)
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 0, i32 8, i32 2)
%res = load float, ptr addrspace(3) %lds
ret float %res
}

; 96-bit (size operand 12) buffer-to-LDS load where the voffset operand is the
; constant 2048 and soffset/imm offset are 0. The assertions expect the
; constant to be materialized into v0 (0x800) and the offen form of
; buffer_load_dwordx3 to be used, with m0 written from s4 beforehand.
define amdgpu_ps void @buffer_load_lds_dwordx3_imm_voffset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
; GFX950-LABEL: buffer_load_lds_dwordx3_imm_voffset:
; GFX950: ; %bb.0:
; GFX950-NEXT: v_mov_b32_e32 v0, 0x800
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 2048, i32 0, i32 0, i32 0)
ret void
}

; 96-bit (size operand 12) buffer-to-LDS load where voffset is the %voffset
; argument and soffset/imm offset are 0. The assertions expect the offen form
; of buffer_load_dwordx3 addressing through v0.
define amdgpu_ps void @buffer_load_lds_dwordx3_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) {
; GFX950-LABEL: buffer_load_lds_dwordx3_v_offset:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 0, i32 0, i32 0)
ret void
}

; 96-bit (size operand 12) buffer-to-LDS load where soffset is the inreg
; %soffset argument and voffset/imm offset are 0. The assertions expect the
; "off" addressing form of buffer_load_dwordx3 with s5 as the scalar offset.
define amdgpu_ps void @buffer_load_lds_dwordx3_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) {
; GFX950-LABEL: buffer_load_lds_dwordx3_s_offset:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx3 off, s[0:3], s5 lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 %soffset, i32 0, i32 0)
ret void
}

; 96-bit (size operand 12) buffer-to-LDS load with both a %voffset argument
; and an inreg %soffset argument, and imm offset 0. The assertions expect the
; offen form of buffer_load_dwordx3 using v0 together with s5.
define amdgpu_ps void @buffer_load_lds_dwordx3_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GFX950-LABEL: buffer_load_lds_dwordx3_vs_offset:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], s5 offen lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 %soffset, i32 0, i32 0)
ret void
}

; 96-bit (size operand 12) buffer-to-LDS load with a %voffset argument, an
; inreg %soffset argument and an immediate offset of 2048. The assertions
; expect the immediate to be folded into the instruction as offset:2048 on the
; offen form of buffer_load_dwordx3.
define amdgpu_ps void @buffer_load_lds_dwordx3_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GFX950-LABEL: buffer_load_lds_dwordx3_vs_imm_offset:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], s5 offen offset:2048 lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 %soffset, i32 2048, i32 0)
ret void
}

;---------------------------------------------------------------------
; dwordx4
;---------------------------------------------------------------------

; 128-bit buffer-to-LDS load with no voffset/soffset. Three calls cover
; immediate offsets 0/4/8 and aux values 0/1/2, followed by a read-back of the
; LDS location. The size operand is 16 so that this test exercises the dwordx4
; opcode its name refers to (it previously passed size 4 and only checked
; buffer_load_dword).
; NOTE(review): the assertions below were adjusted by hand on the assumption
; that only the opcode changes; re-run utils/update_llc_test_checks.py to
; confirm.
define amdgpu_ps float @buffer_load_lds_dwordx4(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
; GFX950-LABEL: buffer_load_lds_dwordx4:
; GFX950: ; %bb.0: ; %main_body
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx4 off, s[0:3], 0 lds
; GFX950-NEXT: buffer_load_dwordx4 off, s[0:3], 0 offset:4 sc0 lds
; GFX950-NEXT: buffer_load_dwordx4 off, s[0:3], 0 offset:8 nt lds
; GFX950-NEXT: v_mov_b32_e32 v0, s4
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ds_read_b32 v0, v0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; return to shader part epilog
main_body:
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 0, i32 4, i32 1)
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 0, i32 8, i32 2)
%res = load float, ptr addrspace(3) %lds
ret float %res
}

; 128-bit (size operand 16) buffer-to-LDS load where the voffset operand is
; the constant 2048 and soffset/imm offset are 0. The assertions expect the
; constant to be materialized into v0 (0x800) and the offen form of
; buffer_load_dwordx4 to be used, with m0 written from s4 beforehand.
define amdgpu_ps void @buffer_load_lds_dwordx4_imm_voffset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
; GFX950-LABEL: buffer_load_lds_dwordx4_imm_voffset:
; GFX950: ; %bb.0:
; GFX950-NEXT: v_mov_b32_e32 v0, 0x800
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 2048, i32 0, i32 0, i32 0)
ret void
}

; 128-bit (size operand 16) buffer-to-LDS load where voffset is the %voffset
; argument and soffset/imm offset are 0. The assertions expect the offen form
; of buffer_load_dwordx4 addressing through v0.
define amdgpu_ps void @buffer_load_lds_dwordx4_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) {
; GFX950-LABEL: buffer_load_lds_dwordx4_v_offset:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 0, i32 0, i32 0)
ret void
}

; 128-bit (size operand 16) buffer-to-LDS load where soffset is the inreg
; %soffset argument and voffset/imm offset are 0. The assertions expect the
; "off" addressing form of buffer_load_dwordx4 with s5 as the scalar offset.
define amdgpu_ps void @buffer_load_lds_dwordx4_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) {
; GFX950-LABEL: buffer_load_lds_dwordx4_s_offset:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx4 off, s[0:3], s5 lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 %soffset, i32 0, i32 0)
ret void
}

; 128-bit (size operand 16) buffer-to-LDS load with both a %voffset argument
; and an inreg %soffset argument, and imm offset 0. The assertions expect the
; offen form of buffer_load_dwordx4 using v0 together with s5.
define amdgpu_ps void @buffer_load_lds_dwordx4_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GFX950-LABEL: buffer_load_lds_dwordx4_vs_offset:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], s5 offen lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 %soffset, i32 0, i32 0)
ret void
}

; 128-bit (size operand 16) buffer-to-LDS load with a %voffset argument, an
; inreg %soffset argument and an immediate offset of 2048. The assertions
; expect the immediate to be folded into the instruction as offset:2048 on the
; offen form of buffer_load_dwordx4.
define amdgpu_ps void @buffer_load_lds_dwordx4_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GFX950-LABEL: buffer_load_lds_dwordx4_vs_imm_offset:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_mov_b32 m0, s4
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], s5 offen offset:2048 lds
; GFX950-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 %soffset, i32 2048, i32 0)
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX950-GISEL: {{.*}}
; GFX950-SDAG: {{.*}}
Loading
Loading