[AMDGPU][True16][CodeGen] reopen "FLAT_load using D16 pseudo instruction" #127673
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes: The previous patch was merged in #114500 but hit a buildbot failure and was reverted. It appears that AMDGPU::OpName::OPERAND_LAST was removed before the previous patch landed, which caused the compile error. Fixed and reopened here.

Patch is 22.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127673.diff

8 Files Affected:
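For orientation before the diff: the patch lowers a True16 D16 load pseudo to either the d16 (low-half) or d16_hi (high-half) real instruction, depending on which 16-bit half of the destination register the pseudo writes. Below is a minimal sketch of just that selection step, using names from the patch; the full helper in AMDGPUMCInstLower.cpp additionally rewrites the destination to its containing 32-bit register and re-adds it as the tied vdst_in input.

// Sketch only -- distilled from lowerT16D16Helper in the diff below.
// Info is a row of the TableGen-generated True16D16 table; TRI is the
// subtarget's SIRegisterInfo.
static unsigned selectD16Opcode(const llvm::AMDGPU::True16D16Info *Info,
                                llvm::Register VDstOrVData,
                                const llvm::SIRegisterInfo &TRI) {
  // A destination in the high half (e.g. v1.h) selects the _d16_hi real
  // opcode; a low-half destination (v1.l) selects the plain _d16 opcode.
  return llvm::AMDGPU::isHi16Reg(VDstOrVData, TRI) ? Info->HiOp : Info->LoOp;
}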
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 38272b9d4840d..895d1e77bf1c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -114,9 +114,63 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
llvm_unreachable("unknown operand type");
}
-void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+// Lower true16 D16 Pseudo instruction to d16_lo/d16_hi MCInst based on
+// Dst/Data's .l/.h selection
+void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
+ MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ const auto *Info = AMDGPU::getT16D16Helper(Opcode);
+
+ llvm::AMDGPU::OpName OpName;
+ if (TII->isDS(Opcode)) {
+ if (MI->mayLoad())
+ OpName = llvm::AMDGPU::OpName::vdst;
+ else if (MI->mayStore())
+ OpName = llvm::AMDGPU::OpName::data0;
+ else
+ llvm_unreachable("LDS load or store expected");
+ } else {
+ OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
+ ? llvm::AMDGPU::OpName::vdata
+ : llvm::AMDGPU::OpName::vdst;
+ }
+
+ // select Dst/Data
+ int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
+ const MachineOperand &MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
+
+ // select hi/lo MCInst
+ bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
+ Opcode = IsHi ? Info->HiOp : Info->LoOp;
+
+ int MCOpcode = TII->pseudoToMCOpcode(Opcode);
+ assert(MCOpcode != -1 &&
+ "Pseudo instruction doesn't have a target-specific version");
+ OutMI.setOpcode(MCOpcode);
+
+ // lower operands
+ for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
+ const MachineOperand &MO = MI->getOperand(I);
+ MCOperand MCOp;
+ if (I == VDstOrVDataIdx)
+ MCOp = MCOperand::createReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
+ else
+ lowerOperand(MO, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+
+ if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
+ MCOperand MCOp;
+ lowerOperand(MIVDstOrVData, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+}
+
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ unsigned Opcode = MI->getOpcode();
+ const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
// FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
// need to select it to the subtarget specific version, and there's no way to
@@ -137,6 +191,9 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
+ } else if (AMDGPU::getT16D16Helper(Opcode)) {
+ lowerT16D16Helper(MI, OutMI);
+ return;
}
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
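A worked example of the flow above, with register and opcode names assumed for illustration:

// Illustration only (names assumed): lowering FLAT_LOAD_SHORT_D16_t16
// with destination v1.h:
//   1. getT16D16Helper(Opcode) returns the table row, so lower()
//      dispatches to lowerT16D16Helper.
//   2. isHi16Reg(v1.h) is true, so Info->HiOp (FLAT_LOAD_SHORT_D16_HI)
//      is chosen, and pseudoToMCOpcode maps it to the subtarget encoding,
//      printed as flat_load_d16_hi_b16.
//   3. The v1.h operand is rewritten to its 32-bit container v1 via
//      get32BitRegister, and v1 is appended again as the tied vdst_in
//      source, since D16 loads preserve the other half of the register.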
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 7176cc5d3439b..5ddf1ca2ab06d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -39,6 +39,8 @@ class AMDGPUMCInstLower {
/// Lower a MachineInstr to an MCInst
void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
};
namespace {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 8fa708b74dde3..ea6e703eba5d9 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -16,6 +16,12 @@ let WantsRoot = true in {
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
}
+class True16D16Table <string hiOp, string loOp> {
+ Instruction T16Op = !cast<Instruction>(NAME);
+ Instruction HiOp = !cast<Instruction>(hiOp);
+ Instruction LoOp = !cast<Instruction>(loOp);
+}
+
//===----------------------------------------------------------------------===//
// FLAT classes
//===----------------------------------------------------------------------===//
@@ -226,6 +232,12 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
+multiclass FLAT_Load_Pseudo_t16<string opName> {
+ def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
+ let True16Predicate = UseRealTrue16Insts in
+ def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
+}
+
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
@@ -662,12 +674,12 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
-def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
-def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
+defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
-def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
+defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
}
def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
@@ -1049,6 +1061,11 @@ class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> :
(inst $vaddr, $offset, 0, $in)
>;
+class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset))),
+ (inst $vaddr, $offset, (i32 0))
+>;
+
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1371,16 +1388,29 @@ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
+ def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
+ def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
+}
+
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
+ def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
+ def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
+ def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
+ def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+
def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
@@ -2761,3 +2791,11 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_
defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
+
+def True16D16Table : GenericTable {
+ let FilterClass = "True16D16Table";
+ let CppTypeName = "True16D16Info";
+ let Fields = ["T16Op", "HiOp", "LoOp"];
+ let PrimaryKey = ["T16Op"];
+ let PrimaryKeyName = "getT16D16Helper";
+}
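The GenericTable above feeds TableGen's searchable-tables backend: every record inheriting from True16D16Table (one per FLAT_Load_Pseudo_t16 instantiation) becomes a row keyed by the pseudo opcode. The emitted code in AMDGPUGenSearchableTables.inc is roughly of the following shape; this is a sketch of the expected output, not the verbatim generated file.

// Rough shape of the generated table and lookup (illustrative):
static const True16D16Info True16D16Table[] = {
  // {T16Op, HiOp, LoOp}, one row per True16D16Table record, e.g.
  // {FLAT_LOAD_UBYTE_D16_t16, FLAT_LOAD_UBYTE_D16_HI, FLAT_LOAD_UBYTE_D16},
  // ... sorted by T16Op so the lookup can binary-search the primary key ...
};

// Named by PrimaryKeyName; returns nullptr when T16Op has no row, which
// is how AMDGPUMCInstLower::lower() tests whether an opcode is a True16
// D16 pseudo at all.
const True16D16Info *getT16D16Helper(unsigned T16Op);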
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 4fd68b52b53bb..e30e257da6873 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2483,8 +2483,15 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
// Return an AGPR+VGPR operand class for the given VGPR register class.
class getLdStRegisterOperand<RegisterClass RC> {
+ // This type of operands is only used in pseudo instructions helping
+ // code generation and thus doesn't need encoding and decoding methods.
+ // It also doesn't need to support AGPRs, because GFX908/A/40 do not
+ // support True16.
+ defvar VLdSt_16 = RegisterOperand<VGPR_16>;
+
RegisterOperand ret =
- !cond(!eq(RC.Size, 32) : AVLdSt_32,
+ !cond(!eq(RC.Size, 16) : VLdSt_16,
+ !eq(RC.Size, 32) : AVLdSt_32,
!eq(RC.Size, 64) : AVLdSt_64,
!eq(RC.Size, 96) : AVLdSt_96,
!eq(RC.Size, 128) : AVLdSt_128,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 59afcbed35294..c521d0dd3ad2d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -430,6 +430,7 @@ struct FP4FP8DstByteSelInfo {
#define GET_VOPDPairs_IMPL
#define GET_VOPTrue16Table_DECL
#define GET_VOPTrue16Table_IMPL
+#define GET_True16D16Table_IMPL
#define GET_WMMAOpcode2AddrMappingTable_DECL
#define GET_WMMAOpcode2AddrMappingTable_IMPL
#define GET_WMMAOpcode3AddrMappingTable_DECL
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index e458b6b9604b6..103993e6435de 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -113,6 +113,12 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
unsigned Opcode;
};
+struct True16D16Info {
+ unsigned T16Op;
+ unsigned HiOp;
+ unsigned LoOp;
+};
+
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -123,6 +129,7 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
#define GET_MAIInstInfoTable_DECL
#define GET_isMFMA_F8F6F4Table_DECL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
+#define GET_True16D16Table_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
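Following the existing pattern for searchable tables, the GET_True16D16Table_DECL macro above pulls the lookup's declaration into this header, while GET_True16D16Table_IMPL in AMDGPUBaseInfo.cpp instantiates the table data and lookup body. A hedged client-side usage sketch (the wrapper function name here is hypothetical):

// Hypothetical convenience wrapper; getT16D16Helper itself comes from the
// generated code included via AMDGPUBaseInfo.h.
#include "Utils/AMDGPUBaseInfo.h"

static bool isTrue16D16Pseudo(unsigned Opcode) {
  // Non-null exactly when Opcode has d16_lo/d16_hi real counterparts.
  return llvm::AMDGPU::getT16D16Helper(Opcode) != nullptr;
}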
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 21a2ae80574e0..db9a89a2a7370 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -3,8 +3,10 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <2 x half> @chain_hi_to_lo_private() {
; GFX900-LABEL: chain_hi_to_lo_private:
@@ -156,14 +158,23 @@ define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: chain_hi_to_lo_arithmatic:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
-; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_arithmatic:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_arithmatic:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
+; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%arith_lo = fadd half %in, 1.0
%load_hi = load half, ptr addrspace(5) %base
@@ -361,18 +372,31 @@ define <2 x half> @chain_hi_to_lo_flat() {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: chain_hi_to_lo_flat:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: flat_load_u16 v0, v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, ptr null, i64 1
%load_lo = load half, ptr %gep_lo
@@ -403,14 +427,23 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_u16 v0, v[0:1]
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, ptr %base_lo
%load_hi = load half, ptr %base_hi
@@ -864,17 +897,31 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
%load_lo = load volatile i16, ptr addrspace(0) %gep_lo
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 4c68b8d35260f..91f9aa1c5fe3b 100644
--- a/llvm/test/CodeGen/AMD...
[truncated]
LGTM
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/18/builds/11671
…seudo for true16 (#127945): The T16D16 table was implemented in llvm/llvm-project#127673. This is a follow-up patch that adds load/store pseudos for flat_store, global_load/global_store, and scratch_load/scratch_store in true16 mode, and updates the codegen test file.