Skip to content

Commit fc90222

Browse files
committed
AMDGPU/GlobalISel: Select llvm.amdgcn.raw.buffer.load
Use intermediate instructions, unlike with buffer stores. This is necessary because of the need to have an internal way to distinguish between signed and unsigned extloads. This introduces some duplication and near duplication with the buffer store selection path. The store handling should maybe be moved into legalization to match and eliminate the duplication.
1 parent e60d658 commit fc90222

10 files changed

+1081
-25
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,11 @@ def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>;
130130

131131
def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>;
132132
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
133+
// Map the generic buffer-load pseudo-instructions to the corresponding
// SelectionDAG buffer-load nodes so imported DAG patterns can select them.
// Separate unsigned/signed extending variants are needed because the
// generic opcode is the only way to carry the sign of the extension.
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>;
133138

134139
// FIXME: Check MMO is atomic
135140
def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
@@ -238,3 +243,15 @@ def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,
238243

239244
def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
240245
GISDNodeXFormEquiv<IMMPopCount>;
246+
247+
// GlobalISel equivalents of the extract_* SDNodeXForms in BUFInstructions.td:
// each renderer pulls one cachepolicy bit out of the packed auxiliary
// immediate operand when emitting the selected instruction.
def gi_extract_glc : GICustomOperandRenderer<"renderExtractGLC">,
  GISDNodeXFormEquiv<extract_glc>;

def gi_extract_slc : GICustomOperandRenderer<"renderExtractSLC">,
  GISDNodeXFormEquiv<extract_slc>;

def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">,
  GISDNodeXFormEquiv<extract_dlc>;

def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
  GISDNodeXFormEquiv<extract_swz>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2903,6 +2903,34 @@ void AMDGPUInstructionSelector::renderTruncTImm1(MachineInstrBuilder &MIB,
29032903
MIB.addImm(MI.getOperand(OpIdx).getImm());
29042904
}
29052905

2906+
/// Render bit 0 (glc) of the packed cachepolicy immediate at \p OpIdx of
/// \p MI as a separate immediate operand on \p MIB.
void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
}
2912+
2913+
/// Render bit 1 (slc) of the packed cachepolicy immediate at \p OpIdx of
/// \p MI as a separate immediate operand on \p MIB.
void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
}
2919+
2920+
/// Render bit 2 (dlc) of the packed cachepolicy immediate at \p OpIdx of
/// \p MI as a separate immediate operand on \p MIB.
void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
}
2926+
2927+
/// Render bit 3 (swz, swizzled-buffer) of the packed cachepolicy immediate
/// at \p OpIdx of \p MI as a separate immediate operand on \p MIB.
void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}
2933+
29062934
bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
29072935
return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
29082936
}

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,14 @@ class AMDGPUInstructionSelector : public InstructionSelector {
233233

234234
void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
235235
int OpIdx) const;
236+
void renderExtractGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
237+
int OpIdx) const;
238+
void renderExtractSLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
239+
int OpIdx) const;
240+
void renderExtractDLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
241+
int OpIdx) const;
242+
void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
243+
int OpIdx) const;
236244

237245
bool isInlineImmediate16(int64_t Imm) const;
238246
bool isInlineImmediate32(int64_t Imm) const;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 122 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,15 @@
1818
#define _USE_MATH_DEFINES
1919
#endif
2020

21-
#include "AMDGPU.h"
2221
#include "AMDGPULegalizerInfo.h"
22+
23+
#include "AMDGPU.h"
24+
#include "AMDGPUGlobalISelUtils.h"
2325
#include "AMDGPUTargetMachine.h"
2426
#include "SIMachineFunctionInfo.h"
2527
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
2628
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
2730
#include "llvm/CodeGen/TargetOpcodes.h"
2831
#include "llvm/CodeGen/ValueTypes.h"
2932
#include "llvm/IR/DerivedTypes.h"
@@ -37,7 +40,7 @@ using namespace llvm;
3740
using namespace LegalizeActions;
3841
using namespace LegalizeMutations;
3942
using namespace LegalityPredicates;
40-
43+
using namespace MIPatternMatch;
4144

4245
static LegalityPredicate isMultiple32(unsigned TypeIdx,
4346
unsigned MaxSize = 1024) {
@@ -2327,6 +2330,55 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
23272330
return true;
23282331
}
23292332

2333+
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
//
// Returns (voffset register, immoffset, total constant offset folded out of
// the original value). The caller uses the last element to offset the
// machine memory operand.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Maximum value of the MUBUF 12-bit unsigned immediate offset field.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef; // Unused here; required by the helper's signature.
  const LLT S32 = LLT::scalar(32);

  // Peel any known-constant component off of OrigOffset; BaseReg is the
  // remaining variable part (may be a null Register if fully constant).
  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // Negative after rounding: move the whole constant into the register
    // part and leave the immediate field empty.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    // Fold the overflow into the variable part, materializing a constant or
    // an add as needed.
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // Ensure callers always get a valid voffset register, even when the whole
  // offset fit in the immediate field.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}
2381+
23302382
/// Handle register layout difference for f16 images for some subtargets.
23312383
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
23322384
MachineRegisterInfo &MRI,
@@ -2383,6 +2435,72 @@ bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
23832435
return Ty == S32;
23842436
}
23852437

2438+
/// Legalize llvm.amdgcn.raw.buffer.load(.format) into a
/// G_AMDGPU_BUFFER_LOAD* pseudo-instruction, splitting the offset operand
/// into voffset/immoffset parts and widening sub-dword results.
///
/// \p IsFormat distinguishes the *.format intrinsic variant.
/// \returns true if the instruction was legalized, false to report failure.
bool AMDGPULegalizerInfo::legalizeRawBufferLoad(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B,
                                                bool IsFormat) const {
  B.setInstr(MI);

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned AuxiliaryData = MI.getOperand(5).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  // Split the bounds-checked offset between the voffset register and the
  // instruction's immediate offset field.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;
  switch (MemSize) {
  case 1:
    if (IsFormat)
      return false;
    Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
    break;
  case 2:
    if (IsFormat)
      return false;
    Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
    break;
  default:
    // TODO: Handle the format variant once a dedicated generic opcode
    // exists. Bail out instead of assigning an invalid opcode (the previous
    // -1 placeholder would wrap around in an unsigned and build a bogus
    // instruction).
    if (IsFormat)
      return false;
    Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
    break;
  }

  // Extending loads produce their value in a 32-bit temporary which is
  // truncated back to the original narrow destination below.
  Register LoadDstReg = MemSize >= 4 ? Dst :
    B.getMRI()->createGenericVirtualRegister(S32);

  Register VIndex = B.buildConstant(S32, 0).getReg(0);

  B.buildInstr(Opc)
    .addDef(LoadDstReg)    // vdata
    .addUse(RSrc)          // rsrc
    .addUse(VIndex)        // vindex
    .addUse(VOffset)       // voffset
    .addUse(SOffset)       // soffset
    .addImm(ImmOffset)     // offset(imm)
    .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
    .addImm(0)             // idxen(imm)
    .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // Truncate the widened result for extending loads, inserting after the
    // load so the legalizer observer sees the new instructions in order.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  }

  MI.eraseFromParent();
  return true;
}
2503+
23862504
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
23872505
MachineIRBuilder &B,
23882506
bool IsInc) const {
@@ -2517,6 +2635,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
25172635
return legalizeRawBufferStore(MI, MRI, B, false);
25182636
case Intrinsic::amdgcn_raw_buffer_store_format:
25192637
return legalizeRawBufferStore(MI, MRI, B, true);
2638+
case Intrinsic::amdgcn_raw_buffer_load:
2639+
return legalizeRawBufferLoad(MI, MRI, B, false);
25202640
case Intrinsic::amdgcn_atomic_inc:
25212641
return legalizeAtomicIncDec(MI, B, true);
25222642
case Intrinsic::amdgcn_atomic_dec:

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,15 @@ class AMDGPULegalizerInfo : public LegalizerInfo {
105105
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
106106
MachineIRBuilder &B, unsigned AddrSpace) const;
107107

108+
std::tuple<Register, unsigned, unsigned>
109+
splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
110+
108111
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
109112
Register Reg) const;
110113
bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
111114
MachineIRBuilder &B, bool IsFormat) const;
112-
115+
bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
116+
MachineIRBuilder &B, bool IsFormat) const;
113117
bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
114118
bool IsInc) const;
115119

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2243,6 +2243,15 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
22432243

22442244
return;
22452245
}
2246+
case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2247+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2248+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2249+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2250+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: {
2251+
applyDefaultMapping(OpdMapper);
2252+
executeInWaterfallLoop(MI, MRI, {1, 4});
2253+
return;
2254+
}
22462255
case AMDGPU::G_INTRINSIC: {
22472256
switch (MI.getIntrinsicID()) {
22482257
case Intrinsic::amdgcn_s_buffer_load: {
@@ -2325,9 +2334,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
23252334
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
23262335
return;
23272336
}
2328-
case Intrinsic::amdgcn_raw_buffer_load:
2329-
case Intrinsic::amdgcn_raw_buffer_load_format:
2330-
case Intrinsic::amdgcn_raw_tbuffer_load:
23312337
case Intrinsic::amdgcn_raw_buffer_store:
23322338
case Intrinsic::amdgcn_raw_buffer_store_format:
23332339
case Intrinsic::amdgcn_raw_tbuffer_store: {
@@ -3061,6 +3067,26 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
30613067
}
30623068
break;
30633069
}
3070+
case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3071+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3072+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3073+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3074+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: {
3075+
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3076+
3077+
// rsrc
3078+
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3079+
3080+
// vindex
3081+
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3082+
3083+
// voffset
3084+
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3085+
3086+
// soffset
3087+
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3088+
break;
3089+
}
30643090
case AMDGPU::G_INTRINSIC: {
30653091
switch (MI.getIntrinsicID()) {
30663092
default:

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1152,19 +1152,19 @@ let SubtargetPredicate = isGFX10Plus in {
11521152
// MUBUF Patterns
11531153
//===----------------------------------------------------------------------===//
11541154

1155-
// Extract individual bits of the packed cachepolicy/aux immediate operand
// of the buffer intrinsics (glc=bit 0, slc=bit 1, dlc=bit 2, swz=bit 3).
// These match on timm rather than imm: the intrinsic cachepolicy operand is
// a target constant, which plain imm would fail to match.
def extract_glc : SDNodeXForm<timm, [{
  return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
}]>;

def extract_slc : SDNodeXForm<timm, [{
  return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
}]>;

def extract_dlc : SDNodeXForm<timm, [{
  return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
}]>;

def extract_swz : SDNodeXForm<timm, [{
  return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
}]>;
11701170

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2146,6 +2146,21 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
21462146
let hasSideEffects = 0;
21472147
}
21482148

2149+
// Generic buffer-load pseudo-instruction used between legalization and
// instruction selection. Separate signed/unsigned extending variants exist
// because, unlike the IR load, the generic MIR opcode must itself carry
// the sign of the extension. Expects a MachineMemOperand in addition to
// the explicit operands.
class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
2163+
21492164
// Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
21502165
// operand Expects a MachineMemOperand in addition to explicit
21512166
// operands.

0 commit comments

Comments
 (0)