Skip to content

Commit 2cb5241

Browse files
committed
Revert "[AMDGPU][True16][CodeGen] FLAT_load using D16 pseudo instruction (#114500)"
This reverts commit f7a5f06 because it fails to build with the following error:

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp:126:37: error: no member named 'OPERAND_LAST' in 'llvm::AMDGPU::OpName'
  126 |   uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
1 parent 55dba06 commit 2cb5241

File tree

8 files changed

+54
-217
lines changed

8 files changed

+54
-217
lines changed

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 1 addition & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -114,63 +114,9 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
114114
llvm_unreachable("unknown operand type");
115115
}
116116

117-
// Lower true16 D16 Pseudo instruction to d16_lo/d16_hi MCInst based on
118-
// Dst/Data's .l/.h selection
119-
void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
120-
MCInst &OutMI) const {
121-
unsigned Opcode = MI->getOpcode();
122-
const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
123-
const SIRegisterInfo &TRI = TII->getRegisterInfo();
124-
const auto *Info = AMDGPU::getT16D16Helper(Opcode);
125-
126-
uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
127-
if (TII->isDS(Opcode)) {
128-
if (MI->mayLoad())
129-
OpName = llvm::AMDGPU::OpName::vdst;
130-
else if (MI->mayStore())
131-
OpName = llvm::AMDGPU::OpName::data0;
132-
else
133-
llvm_unreachable("LDS load or store expected");
134-
} else {
135-
OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
136-
? llvm::AMDGPU::OpName::vdata
137-
: llvm::AMDGPU::OpName::vdst;
138-
}
139-
140-
// select Dst/Data
141-
int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
142-
const MachineOperand &MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
143-
144-
// select hi/lo MCInst
145-
bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
146-
Opcode = IsHi ? Info->HiOp : Info->LoOp;
147-
148-
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
149-
assert(MCOpcode != -1 &&
150-
"Pseudo instruction doesn't have a target-specific version");
151-
OutMI.setOpcode(MCOpcode);
152-
153-
// lower operands
154-
for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
155-
const MachineOperand &MO = MI->getOperand(I);
156-
MCOperand MCOp;
157-
if (I == VDstOrVDataIdx)
158-
MCOp = MCOperand::createReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
159-
else
160-
lowerOperand(MO, MCOp);
161-
OutMI.addOperand(MCOp);
162-
}
163-
164-
if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
165-
MCOperand MCOp;
166-
lowerOperand(MIVDstOrVData, MCOp);
167-
OutMI.addOperand(MCOp);
168-
}
169-
}
170-
171117
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
172118
unsigned Opcode = MI->getOpcode();
173-
const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
119+
const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
174120

175121
// FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
176122
// need to select it to the subtarget specific version, and there's no way to
@@ -191,9 +137,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
191137
Opcode == AMDGPU::SI_TCRETURN_GFX) {
192138
// TODO: How to use branch immediate and avoid register+add?
193139
Opcode = AMDGPU::S_SETPC_B64;
194-
} else if (AMDGPU::getT16D16Helper(Opcode)) {
195-
lowerT16D16Helper(MI, OutMI);
196-
return;
197140
}
198141

199142
int MCOpcode = TII->pseudoToMCOpcode(Opcode);

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,6 @@ class AMDGPUMCInstLower {
3939

4040
/// Lower a MachineInstr to an MCInst
4141
void lower(const MachineInstr *MI, MCInst &OutMI) const;
42-
43-
void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
4442
};
4543

4644
namespace {

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 7 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,6 @@ let WantsRoot = true in {
1616
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
1717
}
1818

19-
class True16D16Table <string hiOp, string loOp> {
20-
Instruction T16Op = !cast<Instruction>(NAME);
21-
Instruction HiOp = !cast<Instruction>(hiOp);
22-
Instruction LoOp = !cast<Instruction>(loOp);
23-
}
24-
2519
//===----------------------------------------------------------------------===//
2620
// FLAT classes
2721
//===----------------------------------------------------------------------===//
@@ -232,12 +226,6 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
232226
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
233227
}
234228

235-
multiclass FLAT_Load_Pseudo_t16<string opName> {
236-
def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
237-
let True16Predicate = UseRealTrue16Insts in
238-
def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
239-
}
240-
241229
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
242230
bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
243231
opName,
@@ -674,12 +662,12 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
674662

675663
let SubtargetPredicate = HasD16LoadStore in {
676664
let TiedSourceNotRead = 1 in {
665+
def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
677666
def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
678-
defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
667+
def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
679668
def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
680-
defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
669+
def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
681670
def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
682-
defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
683671
}
684672

685673
def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
@@ -1061,11 +1049,6 @@ class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> :
10611049
(inst $vaddr, $offset, 0, $in)
10621050
>;
10631051

1064-
class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
1065-
(vt (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset))),
1066-
(inst $vaddr, $offset, (i32 0))
1067-
>;
1068-
10691052
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
10701053
(node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
10711054
(inst $vaddr, $offset, 0, $in)
@@ -1388,29 +1371,16 @@ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
13881371
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
13891372
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
13901373
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
1374+
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
1375+
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
1376+
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
13911377
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
13921378
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
1379+
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
13931380
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
13941381
def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
13951382
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
13961383

1397-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
1398-
let True16Predicate = p in {
1399-
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
1400-
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
1401-
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
1402-
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
1403-
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
1404-
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
1405-
}
1406-
1407-
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
1408-
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
1409-
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
1410-
def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
1411-
def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
1412-
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
1413-
14141384
def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
14151385
def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
14161386

@@ -2791,11 +2761,3 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_
27912761

27922762
defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
27932763
defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
2794-
2795-
def True16D16Table : GenericTable {
2796-
let FilterClass = "True16D16Table";
2797-
let CppTypeName = "True16D16Info";
2798-
let Fields = ["T16Op", "HiOp", "LoOp"];
2799-
let PrimaryKey = ["T16Op"];
2800-
let PrimaryKeyName = "getT16D16Helper";
2801-
}

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2483,15 +2483,8 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
24832483

24842484
// Return an AGPR+VGPR operand class for the given VGPR register class.
24852485
class getLdStRegisterOperand<RegisterClass RC> {
2486-
// This type of operands is only used in pseudo instructions helping
2487-
// code generation and thus doesn't need encoding and decoding methods.
2488-
// It also doesn't need to support AGPRs, because GFX908/A/40 do not
2489-
// support True16.
2490-
defvar VLdSt_16 = RegisterOperand<VGPR_16>;
2491-
24922486
RegisterOperand ret =
2493-
!cond(!eq(RC.Size, 16) : VLdSt_16,
2494-
!eq(RC.Size, 32) : AVLdSt_32,
2487+
!cond(!eq(RC.Size, 32) : AVLdSt_32,
24952488
!eq(RC.Size, 64) : AVLdSt_64,
24962489
!eq(RC.Size, 96) : AVLdSt_96,
24972490
!eq(RC.Size, 128) : AVLdSt_128,

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,6 @@ struct FP4FP8DstByteSelInfo {
430430
#define GET_VOPDPairs_IMPL
431431
#define GET_VOPTrue16Table_DECL
432432
#define GET_VOPTrue16Table_IMPL
433-
#define GET_True16D16Table_IMPL
434433
#define GET_WMMAOpcode2AddrMappingTable_DECL
435434
#define GET_WMMAOpcode2AddrMappingTable_IMPL
436435
#define GET_WMMAOpcode3AddrMappingTable_DECL

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,6 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
113113
unsigned Opcode;
114114
};
115115

116-
struct True16D16Info {
117-
unsigned T16Op;
118-
unsigned HiOp;
119-
unsigned LoOp;
120-
};
121-
122116
#define GET_MIMGBaseOpcode_DECL
123117
#define GET_MIMGDim_DECL
124118
#define GET_MIMGEncoding_DECL
@@ -129,7 +123,6 @@ struct True16D16Info {
129123
#define GET_MAIInstInfoTable_DECL
130124
#define GET_isMFMA_F8F6F4Table_DECL
131125
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
132-
#define GET_True16D16Table_DECL
133126
#include "AMDGPUGenSearchableTables.inc"
134127

135128
namespace IsaInfo {

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 41 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
44
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
55
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
6-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
7-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
8-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
9-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s
108

119
define <2 x half> @chain_hi_to_lo_private() {
1210
; GFX900-LABEL: chain_hi_to_lo_private:
@@ -158,23 +156,14 @@ define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
158156
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
159157
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
160158
;
161-
; GFX11-TRUE16-LABEL: chain_hi_to_lo_arithmatic:
162-
; GFX11-TRUE16: ; %bb.0: ; %bb
163-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164-
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l
165-
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
166-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
167-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
168-
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
169-
;
170-
; GFX11-FAKE16-LABEL: chain_hi_to_lo_arithmatic:
171-
; GFX11-FAKE16: ; %bb.0: ; %bb
172-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173-
; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
174-
; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
175-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
176-
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
177-
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
159+
; GFX11-LABEL: chain_hi_to_lo_arithmatic:
160+
; GFX11: ; %bb.0: ; %bb
161+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162+
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
163+
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
164+
; GFX11-NEXT: s_waitcnt vmcnt(0)
165+
; GFX11-NEXT: v_mov_b32_e32 v0, v1
166+
; GFX11-NEXT: s_setpc_b64 s[30:31]
178167
bb:
179168
%arith_lo = fadd half %in, 1.0
180169
%load_hi = load half, ptr addrspace(5) %base
@@ -372,31 +361,18 @@ define <2 x half> @chain_hi_to_lo_flat() {
372361
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
373362
; GFX10-NEXT: s_setpc_b64 s[30:31]
374363
;
375-
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
376-
; GFX11-TRUE16: ; %bb.0: ; %bb
377-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
379-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
380-
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
381-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
382-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
383-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
384-
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
385-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
386-
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
387-
;
388-
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
389-
; GFX11-FAKE16: ; %bb.0: ; %bb
390-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391-
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
392-
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
393-
; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
394-
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
395-
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
396-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
397-
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
398-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
399-
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
364+
; GFX11-LABEL: chain_hi_to_lo_flat:
365+
; GFX11: ; %bb.0: ; %bb
366+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367+
; GFX11-NEXT: v_mov_b32_e32 v0, 2
368+
; GFX11-NEXT: v_mov_b32_e32 v1, 0
369+
; GFX11-NEXT: flat_load_u16 v0, v[0:1]
370+
; GFX11-NEXT: v_mov_b32_e32 v1, 0
371+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
372+
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
373+
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
374+
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
375+
; GFX11-NEXT: s_setpc_b64 s[30:31]
400376
bb:
401377
%gep_lo = getelementptr inbounds half, ptr null, i64 1
402378
%load_lo = load half, ptr %gep_lo
@@ -427,23 +403,14 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
427403
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
428404
; GFX10-NEXT: s_setpc_b64 s[30:31]
429405
;
430-
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
431-
; GFX11-TRUE16: ; %bb.0: ; %bb
432-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433-
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
434-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
435-
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
436-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
437-
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
438-
;
439-
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
440-
; GFX11-FAKE16: ; %bb.0: ; %bb
441-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442-
; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
443-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
444-
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
445-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
446-
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
406+
; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
407+
; GFX11: ; %bb.0: ; %bb
408+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409+
; GFX11-NEXT: flat_load_u16 v0, v[0:1]
410+
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
411+
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
412+
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
413+
; GFX11-NEXT: s_setpc_b64 s[30:31]
447414
bb:
448415
%load_lo = load half, ptr %base_lo
449416
%load_hi = load half, ptr %base_hi
@@ -897,31 +864,17 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
897864
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
898865
; GFX10-NEXT: s_setpc_b64 s[30:31]
899866
;
900-
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
901-
; GFX11-TRUE16: ; %bb.0: ; %bb
902-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
903-
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
904-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
905-
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
906-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
907-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
908-
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
909-
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
910-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
911-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
912-
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
913-
;
914-
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
915-
; GFX11-FAKE16: ; %bb.0: ; %bb
916-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
917-
; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
918-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
919-
; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
920-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
921-
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
922-
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
923-
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
924-
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
867+
; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
868+
; GFX11: ; %bb.0: ; %bb
869+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870+
; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
871+
; GFX11-NEXT: s_waitcnt vmcnt(0)
872+
; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
873+
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
874+
; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
875+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
876+
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
877+
; GFX11-NEXT: s_setpc_b64 s[30:31]
925878
bb:
926879
%gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
927880
%load_lo = load volatile i16, ptr addrspace(0) %gep_lo

0 commit comments

Comments
 (0)