[AMDGPU] Legalize and select raw/struct_buffer_load with tfe #93310
Conversation
@llvm/pr-subscribers-backend-amdgpu
Author: Mirko Brkušanin (mbrkusanin)
Changes: Patch is 335.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93310.diff
14 Files Affected:
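For context, the non-format TFE buffer loads enabled by this patch surface in IR as a struct return carrying the loaded data plus a 32-bit status dword, which the lowering then splits (see handleByteShortBufferLoads and the legalizer changes below). A minimal sketch of an i8 load with TFE, assuming the same struct-return convention as the existing format+TFE intrinsics; the function name and the mangled intrinsic suffix are illustrative, not taken from the patch:

define amdgpu_ps void @load_i8_with_status(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_out, ptr addrspace(1) %status_out) {
  ; returns { loaded byte, TFE status dword }
  %res = call { i8, i32 } @llvm.amdgcn.raw.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %data = extractvalue { i8, i32 } %res, 0       ; loaded byte
  %status = extractvalue { i8, i32 } %res, 1     ; TFE status dword
  store i8 %data, ptr addrspace(1) %data_out
  store i32 %status, ptr addrspace(1) %status_out
  ret void
}
declare { i8, i32 } @llvm.amdgcn.raw.buffer.load.sl_i8i32s(<4 x i32>, i32, i32, i32 immarg)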
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 152f495a452ba..231db188e65dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -250,6 +250,11 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_TFE, SIbuffer_load_tfe>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT_TFE, SIbuffer_load_ushort_tfe>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, SIbuffer_load_ubyte_tfe>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT_TFE, SIbuffer_load_short_tfe>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE_TFE, SIbuffer_load_byte_tfe>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_TFE, SIbuffer_load_format_tfe>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 980e58510ceb7..375643b7f5197 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5529,6 +5529,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
NODE_NAME_CASE(BUFFER_LOAD_BYTE)
NODE_NAME_CASE(BUFFER_LOAD_SHORT)
+ NODE_NAME_CASE(BUFFER_LOAD_TFE)
+ NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
+ NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
+ NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
+ NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3814b56a4d56a..71c4334029b43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -582,6 +582,11 @@ enum NodeType : unsigned {
BUFFER_LOAD_USHORT,
BUFFER_LOAD_BYTE,
BUFFER_LOAD_SHORT,
+ BUFFER_LOAD_TFE,
+ BUFFER_LOAD_UBYTE_TFE,
+ BUFFER_LOAD_USHORT_TFE,
+ BUFFER_LOAD_BYTE_TFE,
+ BUFFER_LOAD_SHORT_TFE,
BUFFER_LOAD_FORMAT,
BUFFER_LOAD_FORMAT_TFE,
BUFFER_LOAD_FORMAT_D16,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a771b421e77a4..ee7fb20c23aa7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5870,17 +5870,18 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
: AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
}
} else {
- if (IsTFE)
- return false;
switch (MemTy.getSizeInBits()) {
case 8:
- Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
+ Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
+ : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
break;
case 16:
- Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+ Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
+ : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
break;
default:
- Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
+ Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
+ : AMDGPU::G_AMDGPU_BUFFER_LOAD;
break;
}
}
@@ -5892,7 +5893,11 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
- if (NumValueDWords == 1) {
+ if (MemTy.getSizeInBits() < 32) {
+ Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
+ B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
+ B.buildTrunc(Dst, ExtDst);
+ } else if (NumValueDWords == 1) {
B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
} else {
SmallVector<Register, 5> LoadElts;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dbb42a60f71fe..7ebd674757fbc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3041,6 +3041,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
@@ -4323,6 +4328,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 8eaa113ac1816..1fbebc038c189 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1434,6 +1434,15 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_tfe, v2i32, "BUFFER_LOAD_DWORD_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_tfe, v3i32, "BUFFER_LOAD_DWORDX2_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_tfe, v4i32, "BUFFER_LOAD_DWORDX3_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_tfe, v5i32, "BUFFER_LOAD_DWORDX4_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte_tfe, v2i32, "BUFFER_LOAD_SBYTE_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short_tfe, v2i32, "BUFFER_LOAD_SSHORT_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte_tfe, v2i32, "BUFFER_LOAD_UBYTE_TFE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort_tfe, v2i32, "BUFFER_LOAD_USHORT_TFE">;
+
multiclass MUBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_store<name, memoryVt>);
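The new MUBUF_LoadIntrinsicPat entries above encode the TFE result layout of one extra status dword appended to the data: v2i32 for a single-dword (or sub-dword) payload up to v5i32 for a four-dword payload. As a rough IR-level illustration (hypothetical function, illustrative mangled suffix), a three-dword load with TFE would legalize to a v4i32 G_AMDGPU_BUFFER_LOAD_TFE and, per the v4i32 pattern above, select to BUFFER_LOAD_DWORDX3_TFE:

define amdgpu_ps void @load_v3i32_with_status(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_out, ptr addrspace(1) %status_out) {
  %res = call { <3 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %data = extractvalue { <3 x i32>, i32 } %res, 0    ; three data dwords
  %status = extractvalue { <3 x i32>, i32 } %res, 1  ; TFE status dword
  store <3 x i32> %data, ptr addrspace(1) %data_out
  store i32 %status, ptr addrspace(1) %status_out
  ret void
}
declare { <3 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3i32i32s(<4 x i32>, i32, i32, i32 immarg)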
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1d2a5fff23568..b0f6e78709b1a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5962,16 +5962,10 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
assert(M->getNumValues() == 2 || M->getNumValues() == 3);
bool IsTFE = M->getNumValues() == 3;
- unsigned Opc;
- if (IsFormat) {
- Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
- : AMDGPUISD::BUFFER_LOAD_FORMAT;
- } else {
- // TODO: Support non-format TFE loads.
- if (IsTFE)
- return SDValue();
- Opc = AMDGPUISD::BUFFER_LOAD;
- }
+ unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
+ : AMDGPUISD::BUFFER_LOAD_FORMAT)
+ : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
+ : AMDGPUISD::BUFFER_LOAD;
if (IsD16) {
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
@@ -5979,7 +5973,8 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
+ IsTFE);
if (isTypeLegal(LoadVT)) {
return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
@@ -10172,11 +10167,30 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
}
// Handle 8 bit and 16 bit buffer loads
-SDValue
-SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
- SDLoc DL, ArrayRef<SDValue> Ops,
- MachineMemOperand *MMO) const {
+SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
+ EVT LoadVT, SDLoc DL,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO,
+ bool IsTFE) const {
EVT IntVT = LoadVT.changeTypeToInteger();
+
+ if (IsTFE) {
+ unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
+ ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
+ : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
+ SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
+ SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
+ SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
+ DAG.getVectorIdxConstant(1, DL));
+ SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
+ SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
+ return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
+ }
+
unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index fed73f48840fd..292b17da93583 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -275,7 +275,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
ArrayRef<SDValue> Ops,
- MachineMemOperand *MMO) const;
+ MachineMemOperand *MMO,
+ bool IsTFE = false) const;
// Handle 8 bit and 16 bit buffer stores
SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 0ed2f60ea66a7..fd119e0992e56 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -148,6 +148,16 @@ def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ubyte_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ushort_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_byte_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_short_tfe: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT_TFE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_TFE", SDTBufferLoad,
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e7aeaa017306c..d1667955f83db 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3762,6 +3762,11 @@ def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_UBYTE_TFE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SBYTE_TFE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_USHORT_TFE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SSHORT_TFE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
new file mode 100644
index 0000000000000..8eb05bb9565f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -0,0 +1,1515 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX6
+; RUN: llc -global-isel -mcpu=hawaii -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX67,GFX7
+; RUN: llc -global-isel -mcpu=fiji -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX8
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX910
+; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX910
+; RUN: llc -global-isel -mcpu=gfx1100 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -global-isel -mcpu=gfx1200 -mtriple=amdgcn-- -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
+
+define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
+ ; GFX67-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX67: bb.1 (%ir-block.0):
+ ; GFX67-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX67-NEXT: {{ $}}
+ ; GFX67-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX67-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX67-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX67-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX67-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX67-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX67-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX67-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX67-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX67-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX67-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX67-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX67-NEXT: [[BUFFER_LOAD_UBYTE_TFE_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX67-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_OFFSET]].sub0
+ ; GFX67-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_OFFSET]].sub1
+ ; GFX67-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX67-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+ ; GFX67-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1
+ ; GFX67-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX67-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE3]], %subreg.sub2_sub3
+ ; GFX67-NEXT: BUFFER_STORE_BYTE_ADDR64 [[COPY8]], [[REG_SEQUENCE1]], [[REG_SEQUENCE4]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX67-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX67-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+ ; GFX67-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_3]], %subreg.sub0, [[S_MOV_B32_4]], %subreg.sub1
+ ; GFX67-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX67-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_1]], %subreg.sub0_sub1, [[REG_SEQUENCE5]], %subreg.sub2_sub3
+ ; GFX67-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY9]], [[REG_SEQUENCE2]], [[REG_SEQUENCE6]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX67-NEXT: S_ENDPGM 0
+ ;
+ ; GFX8-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX8-NEXT: [[BUFFER_LOAD_UBYTE_TFE_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_OFFSET]].sub0
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_OFFSET]].sub1
+ ; GFX8-NEXT: FLAT_STORE_BYTE [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX8-NEXT: S_ENDPGM 0
+ ;
+ ; GFX910-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX910: bb.1 (%ir-block.0):
+ ; GFX910-NEXT: liveins: $sgpr0, $...
[truncated]
@llvm/pr-subscribers-llvm-globalisel
Author: Mirko Brkušanin (mbrkusanin)
Changes: Patch is 335.63 KiB; full version: https://github.com/llvm/llvm-project/pull/93310.diff (14 files affected).
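The patch title also covers struct_buffer_load with TFE. The struct variants take an additional vindex operand but should otherwise follow the same struct-return shape; a minimal sketch under that assumption (function name and mangled intrinsic suffix are illustrative, not taken from the patch):

define amdgpu_ps void @struct_load_i16_with_status(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %data_out, ptr addrspace(1) %status_out) {
  ; operands: rsrc, vindex, voffset, soffset, aux
  %res = call { i16, i32 } @llvm.amdgcn.struct.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
  %data = extractvalue { i16, i32 } %res, 0      ; loaded halfword
  %status = extractvalue { i16, i32 } %res, 1    ; TFE status dword
  store i16 %data, ptr addrspace(1) %data_out
  store i32 %status, ptr addrspace(1) %status_out
  ret void
}
declare { i16, i32 } @llvm.amdgcn.struct.buffer.load.sl_i16i32s(<4 x i32>, i32, i32, i32, i32 immarg)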
Force-pushed from 96023b9 to bdd4da1.
No description provided.