AMDGPU: Fix using wrong memory type for non-image resource intrinsics #94911
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

An 8 x i16 raw load was incorrectly using a 64-bit memory type, which would assert in the MachineMemOperand constructor: the non-image path capped the lane count at 4, so an 8-lane return type was given a 4-lane memory type that no longer matched the size of the actual access. This is preparation for a cleanup that will make the buffer intrinsics work for all legal types.

Full diff: https://github.com/llvm/llvm-project/pull/94911.diff

3 Files Affected:
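For context, a minimal reproducer in the shape of the new test below (a sketch, not part of the patch itself):

```llvm
; Before this fix, the 8 x half return type (128 bits) was given a 4-lane,
; 64-bit memVT, tripping the size assertion in the MachineMemOperand
; constructor during selection.
define amdgpu_ps void @load_v8f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %out) {
  %val = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  store <8 x half> %val, ptr addrspace(3) %out
  ret void
}
```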
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index b05834e5803a2..9ff58e339fe9f 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1433,6 +1433,8 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8f16, "BUFFER_LOAD_DWORDX4">;
+
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
@@ -1533,6 +1535,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8f16, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d8667affdb4d..53985ae1303fe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1200,9 +1200,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOVolatile;
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
- unsigned MaxNumLanes = 4;
-
if (RsrcIntr->IsImage) {
+ unsigned MaxNumLanes = 4;
+
const AMDGPU::ImageDimIntrinsicInfo *Intr
= AMDGPU::getImageDimIntrinsicInfo(IntrID);
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
@@ -1215,9 +1215,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
= cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
}
- }
- Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+ Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+ } else {
+ Info.memVT = memVTFromLoadIntrReturn(
+ CI.getType(), std::numeric_limits<unsigned>::max());
+ }
// FIXME: What does alignment mean for an image?
Info.opc = ISD::INTRINSIC_W_CHAIN;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll
new file mode 100644
index 0000000000000..efaee6feebaaf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=finalize-isel -o - %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps void @raw_ptr_buffer_load_v8f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+ ; GCN-LABEL: name: raw_ptr_buffer_load_v8f16
+ ; GCN: bb.0 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed [[REG_SEQUENCE2]], killed [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8)
+ ; GCN-NEXT: DS_WRITE_B128_gfx9 [[COPY]], killed [[BUFFER_LOAD_DWORDX4_OFFSET]], 0, 0, implicit $exec :: (store (s128) into %ir.ptr, addrspace 3)
+ ; GCN-NEXT: S_ENDPGM 0
+ %val = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ store <8 x half> %val, ptr addrspace(3) %ptr
+ ret void
+}
+
+define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) {
+ ; GCN-LABEL: name: buffer_store_v8f16
+ ; GCN: bb.0 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]]
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[COPY13]], [[COPY]], killed [[REG_SEQUENCE2]], killed [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8)
+ ; GCN-NEXT: S_ENDPGM 0
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
Is that a typo? The patch only touches 8 x f16.
The test is for f16, but it's any 16-bit element type. The test (and TableGen part) is mostly filler because I'm fixing a bunch of these in a later patch.
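For illustration, the analogous overloads for the other 16-bit element types look like this (a sketch; these variants are not exercised by the test in this patch, and complete selection coverage is deferred to the later patch mentioned above):

```llvm
; Sketch: the memVT computation fixed here applies to any 16-bit element
; type, not just half; pattern coverage for these other overloads is
; left to a follow-up patch.
define amdgpu_ps void @load_v8i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %out) {
  %val = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  store <8 x i16> %val, ptr addrspace(3) %out
  ret void
}
```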
Force-pushed from df5d6b4 to acf9c6b
ping
An 8 x i16/f16/bf16 raw load was incorrectly using a 64-bit memory type, which would assert in the MachineMemOperand constructor. This is preparation for a cleanup which will make the buffer intrinsics work for all legal types.
Force-pushed from acf9c6b to f564151
ping
LGTM
An 8 x i16 raw load was incorrectly using a 64-bit memory type, which would assert in the MachineMemOperand constructor.
This is preparation for a cleanup which will make the buffer intrinsics work for all legal types.