[AMDGPU][True16][CodeGen] FLAT_load using D16 pseudo instruction #114500

Merged · 3 commits · Feb 18, 2025
59 changes: 58 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -113,9 +113,63 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
llvm_unreachable("unknown operand type");
}

// Lower a true16 D16 pseudo instruction to a d16_lo/d16_hi MCInst based on
// the Dst/Data operand's .l/.h selection.
void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
const SIRegisterInfo &TRI = TII->getRegisterInfo();
const auto *Info = AMDGPU::getT16D16Helper(Opcode);

uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
if (TII->isDS(Opcode)) {
if (MI->mayLoad())
OpName = llvm::AMDGPU::OpName::vdst;
else if (MI->mayStore())
OpName = llvm::AMDGPU::OpName::data0;
else
llvm_unreachable("LDS load or store expected");
} else {
OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
? llvm::AMDGPU::OpName::vdata
: llvm::AMDGPU::OpName::vdst;
}

// Select the Dst/Data operand.
int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
const MachineOperand &MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);

// Select the hi or lo MCInst.
bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
Opcode = IsHi ? Info->HiOp : Info->LoOp;

int MCOpcode = TII->pseudoToMCOpcode(Opcode);
assert(MCOpcode != -1 &&
"Pseudo instruction doesn't have a target-specific version");
OutMI.setOpcode(MCOpcode);

// Lower the operands.
for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
const MachineOperand &MO = MI->getOperand(I);
MCOperand MCOp;
if (I == VDstOrVDataIdx)
MCOp = MCOperand::createReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
else
lowerOperand(MO, MCOp);
OutMI.addOperand(MCOp);
}

if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
MCOperand MCOp;
lowerOperand(MIVDstOrVData, MCOp);
OutMI.addOperand(MCOp);
}
}
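
// Illustrative example (a sketch; the printed mnemonics below are
// assumptions, since the final names come from the subtarget's
// real-instruction tables):
//   FLAT_LOAD_UBYTE_D16_t16 %v1.l, ... -> Info->LoOp (flat_load_d16_u8 v1)
//   FLAT_LOAD_UBYTE_D16_t16 %v1.h, ... -> Info->HiOp (flat_load_d16_hi_u8 v1)
// In both cases the 16-bit vdst is rewritten to its containing 32-bit VGPR.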

void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());

// FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
// need to select it to the subtarget specific version, and there's no way to
@@ -136,6 +190,9 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
} else if (AMDGPU::getT16D16Helper(Opcode)) {
lowerT16D16Helper(MI, OutMI);
return;
}

int MCOpcode = TII->pseudoToMCOpcode(Opcode);
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -39,6 +39,8 @@ class AMDGPUMCInstLower {

/// Lower a MachineInstr to an MCInst
void lower(const MachineInstr *MI, MCInst &OutMI) const;

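/// Lower a True16 D16 pseudo instruction to the d16_lo/d16_hi MCInst
/// selected by the .l/.h half of its Dst/Data operand.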
void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
};

namespace {
52 changes: 45 additions & 7 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -16,6 +16,12 @@ let WantsRoot = true in {
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
}

class True16D16Table <string hiOp, string loOp> {
Instruction T16Op = !cast<Instruction>(NAME);
Instruction HiOp = !cast<Instruction>(hiOp);
Instruction LoOp = !cast<Instruction>(loOp);
}
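
// Definitions mixing in True16D16Table are collected into the searchable
// True16D16Table GenericTable at the end of this file, keyed by the True16
// pseudo opcode.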

//===----------------------------------------------------------------------===//
// FLAT classes
//===----------------------------------------------------------------------===//
@@ -226,6 +232,12 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}

multiclass FLAT_Load_Pseudo_t16<string opName> {
def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
let True16Predicate = UseRealTrue16Insts in
def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
}
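
// For example, "defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16<...>" defines
// both FLAT_LOAD_UBYTE_D16 (32-bit dst, tied vdst_in) and
// FLAT_LOAD_UBYTE_D16_t16 (16-bit dst), with True16D16Table recording
// FLAT_LOAD_UBYTE_D16_HI and FLAT_LOAD_UBYTE_D16 as its hi/lo replacements.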

class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
@@ -662,12 +674,12 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;

let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
}

def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
@@ -1049,6 +1061,11 @@ class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> :
(inst $vaddr, $offset, 0, $in)
>;

class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset))),
(inst $vaddr, $offset, (i32 0))
>;
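
// Note that the t16 pattern has no tied $in operand: the pseudo writes a true
// 16-bit register, and the tied vdst_in required by the real d16 instructions
// is re-added during MC lowering (see lowerT16D16Helper).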

class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1371,16 +1388,29 @@ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;

foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
}

let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts

def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;

@@ -2761,3 +2791,11 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_

defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;

def True16D16Table : GenericTable {
let FilterClass = "True16D16Table";
let CppTypeName = "True16D16Info";
let Fields = ["T16Op", "HiOp", "LoOp"];
let PrimaryKey = ["T16Op"];
let PrimaryKeyName = "getT16D16Helper";
}
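
The GenericTable above makes TableGen emit a keyed lookup into
AMDGPUGenSearchableTables.inc. As a minimal sketch (the exact generated
signature is an assumption inferred from PrimaryKeyName and the field types),
the interface the MC lowering relies on looks like:

// Sketch of the TableGen-generated lookup; it returns null when T16Op is not
// a True16 D16 pseudo, which is how lower() decides whether to dispatch to
// lowerT16D16Helper().
namespace llvm::AMDGPU {
const True16D16Info *getT16D16Helper(unsigned T16Op);
} // namespace llvm::AMDGPU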
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2483,8 +2483,15 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,

// Return an AGPR+VGPR operand class for the given VGPR register class.
class getLdStRegisterOperand<RegisterClass RC> {
// This type of operand is only used in pseudo instructions that help
// code generation, so it doesn't need encoding and decoding methods.
// It also doesn't need to support AGPRs, because GFX908/GFX90A/GFX940
// do not support True16.
defvar VLdSt_16 = RegisterOperand<VGPR_16>;

RegisterOperand ret =
!cond(!eq(RC.Size, 16) : VLdSt_16,
!eq(RC.Size, 32) : AVLdSt_32,
!eq(RC.Size, 64) : AVLdSt_64,
!eq(RC.Size, 96) : AVLdSt_96,
!eq(RC.Size, 128) : AVLdSt_128,
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -430,6 +430,7 @@ struct FP4FP8DstByteSelInfo {
#define GET_VOPDPairs_IMPL
#define GET_VOPTrue16Table_DECL
#define GET_VOPTrue16Table_IMPL
#define GET_True16D16Table_IMPL
#define GET_WMMAOpcode2AddrMappingTable_DECL
#define GET_WMMAOpcode2AddrMappingTable_IMPL
#define GET_WMMAOpcode3AddrMappingTable_DECL
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -109,6 +109,12 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
unsigned Opcode;
};

struct True16D16Info {
unsigned T16Op;
unsigned HiOp;
unsigned LoOp;
};
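
// Typical use, mirroring AMDGPUMCInstLower::lowerT16D16Helper() (a sketch):
//   if (const True16D16Info *Info = AMDGPU::getT16D16Helper(Opcode))
//     Opcode = IsHi ? Info->HiOp : Info->LoOp;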

#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -119,6 +125,7 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
#define GET_MAIInstInfoTable_DECL
#define GET_isMFMA_F8F6F4Table_DECL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
#define GET_True16D16Table_DECL
#include "AMDGPUGenSearchableTables.inc"

namespace IsaInfo {
129 changes: 88 additions & 41 deletions llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -3,8 +3,10 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s

define <2 x half> @chain_hi_to_lo_private() {
; GFX900-LABEL: chain_hi_to_lo_private:
@@ -156,14 +158,23 @@ define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: chain_hi_to_lo_arithmatic:
; GFX11-TRUE16: ; %bb.0: ; %bb
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_arithmatic:
; GFX11-FAKE16: ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%arith_lo = fadd half %in, 1.0
%load_hi = load half, ptr addrspace(5) %base
@@ -361,18 +372,31 @@ define <2 x half> @chain_hi_to_lo_flat() {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
; GFX11-TRUE16: ; %bb.0: ; %bb
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
; GFX11-FAKE16: ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, ptr null, i64 1
%load_lo = load half, ptr %gep_lo
@@ -403,14 +427,23 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
; GFX11-TRUE16: ; %bb.0: ; %bb
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
; GFX11-FAKE16: ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, ptr %base_lo
%load_hi = load half, ptr %base_hi
@@ -864,17 +897,31 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
; GFX11-TRUE16: ; %bb.0: ; %bb
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
; GFX11-FAKE16: ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
%load_lo = load volatile i16, ptr addrspace(0) %gep_lo