Skip to content

Commit 24013a1

Browse files
committed
[AMDGPU][True16][MC] FLAT load/store supporting True16 format
1 parent c60b055 commit 24013a1

File tree

7 files changed

+187
-53
lines changed

7 files changed

+187
-53
lines changed

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
116116
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
117117
unsigned Opcode = MI->getOpcode();
118118
const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
119+
const SIRegisterInfo &TRI = TII->getRegisterInfo();
119120

120121
// FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
121122
// need to select it to the subtarget specific version, and there's no way to
@@ -132,6 +133,35 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
132133
OutMI.addOperand(Dest);
133134
OutMI.addOperand(Src);
134135
return;
136+
} else if (const auto *Info = AMDGPU::getT16D16Helper(Opcode)) {
137+
int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
138+
int VDstOrVDataIdx = VDataIdx != -1 ? VDataIdx : 0;
139+
MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
140+
bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
141+
Opcode = IsHi ? Info->HiOp : Info->LoOp;
142+
MIVDstOrVData.clearParent(); // Avoid use list error in setReg call
143+
MIVDstOrVData.setReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
144+
145+
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
146+
assert(MCOpcode != -1 &&
147+
"Pseudo instruction doesn't have a target-specific version");
148+
OutMI.setOpcode(MCOpcode);
149+
for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
150+
const MachineOperand &MO = MI->getOperand(I);
151+
MCOperand MCOp;
152+
if (I == VDstOrVDataIdx)
153+
lowerOperand(MIVDstOrVData, MCOp);
154+
else
155+
lowerOperand(MO, MCOp);
156+
OutMI.addOperand(MCOp);
157+
}
158+
159+
if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
160+
MCOperand MCOp;
161+
lowerOperand(MIVDstOrVData, MCOp);
162+
OutMI.addOperand(MCOp);
163+
}
164+
return;
135165
} else if (Opcode == AMDGPU::SI_TCRETURN ||
136166
Opcode == AMDGPU::SI_TCRETURN_GFX) {
137167
// TODO: How to use branch immediate and avoid register+add?

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@ def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot
1414
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
1515
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [SDNPWantRoot], -10>;
1616

17+
// Mix-in that records, for a True16 pseudo, which instructions stand in for
// it. T16Op is the instruction this class is attached to (resolved through
// NAME); HiOp and LoOp are the instructions named by the hiOp / loOp strings.
// Records deriving from this class are collected by the GenericTable def of
// the same name, which uses it as its FilterClass.
class True16D16Table <string hiOp, string loOp> {
18+
Instruction T16Op = !cast<Instruction>(NAME);
19+
Instruction HiOp = !cast<Instruction>(hiOp);
20+
Instruction LoOp = !cast<Instruction>(loOp);
21+
}
22+
1723
//===----------------------------------------------------------------------===//
1824
// FLAT classes
1925
//===----------------------------------------------------------------------===//
@@ -225,6 +231,12 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
225231
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
226232
}
227233

234+
// Defines two load pseudos from one opName:
//  - the unsuffixed record, a FLAT_Load_Pseudo on VGPR_32 with 1 as the third
//    argument (same arguments the plain D16 load defs use);
//  - a "_t16" record on VGPR_16, guarded by True16Predicate =
//    UseRealTrue16Insts, which also derives from True16D16Table with
//    NAME#"_HI" as its HiOp and NAME as its LoOp.
// NOTE(review): the "_HI" instruction is referenced by name only, so a
// matching <NAME>_HI record is expected to be defined separately.
multiclass FLAT_Load_Pseudo_t16<string opName> {
235+
def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
236+
let True16Predicate = UseRealTrue16Insts in
237+
def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
238+
}
239+
228240
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
229241
bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
230242
opName,
@@ -661,12 +673,12 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
661673

662674
let SubtargetPredicate = HasD16LoadStore in {
663675
let TiedSourceNotRead = 1 in {
664-
def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
665676
def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
666-
def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
677+
defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
667678
def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
668-
def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
679+
defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
669680
def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
681+
defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
670682
}
671683

672684
def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
@@ -1048,6 +1060,11 @@ class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> :
10481060
(inst $vaddr, $offset, 0, $in)
10491061
>;
10501062

1063+
// Selection pattern for a load `node` producing a value of type `vt` from a
// FlatOffset address (a 64-bit VGPR $vaddr plus an i32 $offset). The result
// is `inst` with $vaddr, $offset and a literal (i32 0) as its third operand.
// Unlike FlatLoadPat_D16, no incoming $in value is matched or forwarded.
class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
1064+
(vt (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset))),
1065+
(inst $vaddr, $offset, (i32 0))
1066+
>;
1067+
10511068
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
10521069
(node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
10531070
(inst $vaddr, $offset, 0, $in)
@@ -1370,16 +1387,29 @@ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
13701387
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
13711388
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
13721389
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
1373-
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
1374-
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
1375-
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
13761390
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
13771391
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
1378-
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
13791392
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
13801393
def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
13811394
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
13821395

1396+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
1397+
let True16Predicate = p in {
1398+
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
1399+
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
1400+
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
1401+
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
1402+
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
1403+
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
1404+
}
1405+
1406+
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
1407+
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
1408+
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
1409+
def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
1410+
def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
1411+
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
1412+
13831413
def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
13841414
def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
13851415

@@ -2760,3 +2790,11 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_
27602790

27612791
defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
27622792
defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
2793+
2794+
// Searchable table built from every record that derives from the
// True16D16Table class. Each row carries the T16Op, HiOp and LoOp fields and
// is exposed to C++ as True16D16Info; rows are keyed by T16Op and looked up
// through the generated getT16D16Helper function.
def True16D16Table : GenericTable {
2795+
let FilterClass = "True16D16Table";
2796+
let CppTypeName = "True16D16Info";
2797+
let Fields = ["T16Op", "HiOp", "LoOp"];
2798+
let PrimaryKey = ["T16Op"];
2799+
let PrimaryKeyName = "getT16D16Helper";
2800+
}

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2482,8 +2482,15 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
24822482

24832483
// Return an AGPR+VGPR operand class for the given VGPR register class.
24842484
class getLdStRegisterOperand<RegisterClass RC> {
2485+
// This type of operand is only used in pseudo instructions helping
2486+
// code generation and thus doesn't need encoding and decoding methods.
2487+
// It also doesn't need to support AGPRs, because GFX908/A/40 do not
2488+
// support True16.
2489+
defvar VLdSt_16 = RegisterOperand<VGPR_16>;
2490+
24852491
RegisterOperand ret =
2486-
!cond(!eq(RC.Size, 32) : AVLdSt_32,
2492+
!cond(!eq(RC.Size, 16) : VLdSt_16,
2493+
!eq(RC.Size, 32) : AVLdSt_32,
24872494
!eq(RC.Size, 64) : AVLdSt_64,
24882495
!eq(RC.Size, 96) : AVLdSt_96,
24892496
!eq(RC.Size, 128) : AVLdSt_128,

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,7 @@ struct FP8DstByteSelInfo {
411411
#define GET_VOPDPairs_IMPL
412412
#define GET_VOPTrue16Table_DECL
413413
#define GET_VOPTrue16Table_IMPL
414+
#define GET_True16D16Table_IMPL
414415
#define GET_WMMAOpcode2AddrMappingTable_DECL
415416
#define GET_WMMAOpcode2AddrMappingTable_IMPL
416417
#define GET_WMMAOpcode3AddrMappingTable_DECL

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,12 @@ struct MFMA_F8F6F4_Info {
103103
uint8_t NumRegsSrcB;
104104
};
105105

106+
// Row type of the True16D16Table searchable table (its CppTypeName in
// FLATInstructions.td). The members mirror the table's Fields list.
struct True16D16Info {
107+
unsigned T16Op; // Primary key of the table: the True16 pseudo's opcode.
108+
unsigned HiOp; // Opcode AMDGPUMCInstLower::lower picks when isHi16Reg is true.
109+
unsigned LoOp; // Opcode AMDGPUMCInstLower::lower picks otherwise.
110+
};
111+
106112
#define GET_MIMGBaseOpcode_DECL
107113
#define GET_MIMGDim_DECL
108114
#define GET_MIMGEncoding_DECL
@@ -112,6 +118,7 @@ struct MFMA_F8F6F4_Info {
112118
#define GET_MAIInstInfoTable_DECL
113119
#define GET_MAIInstInfoTable_DECL
114120
#define GET_isMFMA_F8F6F4Table_DECL
121+
#define GET_True16D16Table_DECL
115122
#include "AMDGPUGenSearchableTables.inc"
116123

117124
namespace IsaInfo {

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
44
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
55
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
6-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
7-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
8+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
9+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
810

911
define <2 x half> @chain_hi_to_lo_private() {
1012
; GFX900-LABEL: chain_hi_to_lo_private:
@@ -156,14 +158,23 @@ define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
156158
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
157159
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
158160
;
159-
; GFX11-LABEL: chain_hi_to_lo_arithmatic:
160-
; GFX11: ; %bb.0: ; %bb
161-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162-
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
163-
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
164-
; GFX11-NEXT: s_waitcnt vmcnt(0)
165-
; GFX11-NEXT: v_mov_b32_e32 v0, v1
166-
; GFX11-NEXT: s_setpc_b64 s[30:31]
161+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_arithmatic:
162+
; GFX11-TRUE16: ; %bb.0: ; %bb
163+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164+
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l
165+
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
166+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
167+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
168+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
169+
;
170+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_arithmatic:
171+
; GFX11-FAKE16: ; %bb.0: ; %bb
172+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173+
; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
174+
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
175+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
176+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
177+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
167178
bb:
168179
%arith_lo = fadd half %in, 1.0
169180
%load_hi = load half, ptr addrspace(5) %base
@@ -361,18 +372,31 @@ define <2 x half> @chain_hi_to_lo_flat() {
361372
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
362373
; GFX10-NEXT: s_setpc_b64 s[30:31]
363374
;
364-
; GFX11-LABEL: chain_hi_to_lo_flat:
365-
; GFX11: ; %bb.0: ; %bb
366-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367-
; GFX11-NEXT: v_mov_b32_e32 v0, 2
368-
; GFX11-NEXT: v_mov_b32_e32 v1, 0
369-
; GFX11-NEXT: flat_load_u16 v0, v[0:1]
370-
; GFX11-NEXT: v_mov_b32_e32 v1, 0
371-
; GFX11-NEXT: v_mov_b32_e32 v2, 0
372-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
373-
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
374-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
375-
; GFX11-NEXT: s_setpc_b64 s[30:31]
375+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
376+
; GFX11-TRUE16: ; %bb.0: ; %bb
377+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
379+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
380+
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
381+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
382+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
383+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
384+
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
385+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
386+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
387+
;
388+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
389+
; GFX11-FAKE16: ; %bb.0: ; %bb
390+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
392+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
393+
; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
394+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
395+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
396+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
397+
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
398+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
399+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
376400
bb:
377401
%gep_lo = getelementptr inbounds half, ptr null, i64 1
378402
%load_lo = load half, ptr %gep_lo
@@ -403,14 +427,23 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
403427
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
404428
; GFX10-NEXT: s_setpc_b64 s[30:31]
405429
;
406-
; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
407-
; GFX11: ; %bb.0: ; %bb
408-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409-
; GFX11-NEXT: flat_load_u16 v0, v[0:1]
410-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
411-
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
412-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
413-
; GFX11-NEXT: s_setpc_b64 s[30:31]
430+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
431+
; GFX11-TRUE16: ; %bb.0: ; %bb
432+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433+
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
434+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
435+
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
436+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
437+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
438+
;
439+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
440+
; GFX11-FAKE16: ; %bb.0: ; %bb
441+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442+
; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
443+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
444+
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
445+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
446+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
414447
bb:
415448
%load_lo = load half, ptr %base_lo
416449
%load_hi = load half, ptr %base_hi
@@ -864,17 +897,31 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
864897
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
865898
; GFX10-NEXT: s_setpc_b64 s[30:31]
866899
;
867-
; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
868-
; GFX11: ; %bb.0: ; %bb
869-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870-
; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
871-
; GFX11-NEXT: s_waitcnt vmcnt(0)
872-
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
873-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
874-
; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
875-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
876-
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
877-
; GFX11-NEXT: s_setpc_b64 s[30:31]
900+
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
901+
; GFX11-TRUE16: ; %bb.0: ; %bb
902+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
903+
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
904+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
905+
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
906+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
907+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
908+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
909+
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
910+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
911+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
912+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
913+
;
914+
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
915+
; GFX11-FAKE16: ; %bb.0: ; %bb
916+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
917+
; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
918+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
919+
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
920+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
921+
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
922+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
923+
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
924+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
878925
bb:
879926
%gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
880927
%load_lo = load volatile i16, ptr addrspace(0) %gep_lo

llvm/test/CodeGen/AMDGPU/flat-address-space.ll

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIVI %s
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,CIVI,CIVI-HSA %s
44
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS %s
6-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10PLUS %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-TRUE16 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-FAKE16 %s
78

89
; GCN-LABEL: {{^}}store_flat_i32:
910
; GCN-DAG: s_load_{{dwordx2|b64}} s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
@@ -224,7 +225,8 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(ptr %fptr, i8 %x) #0 {
224225
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
225226
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
226227
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
227-
; GFX11: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
228+
; GFX11-TRUE16: flat_load_d16_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
229+
; GFX11-FAKE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
228230
define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
229231
%fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
230232
%val = load volatile i8, ptr %fptr.offset
@@ -234,7 +236,9 @@ define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
234236
; GCN-LABEL: {{^}}load_flat_i8_max_offset_p1:
235237
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
236238
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
237-
; GFX10PLUS: flat_load_{{ubyte|u8}} v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
239+
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
240+
; GFX11-TRUE16: flat_load_d16_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
241+
; GFX11-FAKE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
238242
define amdgpu_kernel void @load_flat_i8_max_offset_p1(ptr %fptr) #0 {
239243
%fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
240244
%val = load volatile i8, ptr %fptr.offset

0 commit comments

Comments
 (0)