Commit 18b2180

AMDGPU/GlobalISel: Legalize odd sized loads with widening
Custom lower and widen odd sized loads up to the alignment. The default set of legalization actions doesn't have a way to represent this. This fixes naturally aligned <3 x s8> and <3 x s16> loads.

This also starts moving towards eliminating the buggy and overcomplicated legalization rules for narrowing. All the memory size changes should be done in the lower or custom action, not NarrowScalar / FewerElements. These currently have redundant and ambiguous code with the lower action.
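For illustration only, here is a minimal standalone C++ sketch of the widening criterion the message describes (an odd sized load may be widened to the next power of two when the known alignment already covers the rounded size). The helper names here are made up for this example; the commit itself implements the check as shouldWidenLoad using LLVM's isPowerOf2_32 / NextPowerOf2.

#include <cstdint>
#include <cstdio>

// Plain-C++ stand-ins for the bit utilities used in the patch.
static bool isPowerOf2(uint64_t X) { return X && (X & (X - 1)) == 0; }

static uint64_t nextPowerOf2(uint64_t X) {
  uint64_t P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

// SizeInBits: in-memory size of the load. AlignInBits: known alignment of the
// access. Widening may only read bytes known to be dereferenceable, i.e. up to
// the alignment, so it is allowed only when the rounded size fits within it.
static bool canWidenOddSizedLoad(uint64_t SizeInBits, uint64_t AlignInBits) {
  if (isPowerOf2(SizeInBits))
    return false; // Already a natural size; nothing to widen.
  return nextPowerOf2(SizeInBits) <= AlignInBits;
}

int main() {
  // <3 x s16> is 48 bits. With 8-byte (64-bit) alignment it can be widened
  // to a single 64-bit load.
  std::printf("%d\n", canWidenOddSizedLoad(48, 64)); // prints 1
  // With only 4-byte alignment the widened load could read past the known
  // dereferenceable bytes, so it must not be widened.
  std::printf("%d\n", canWidenOddSizedLoad(48, 32)); // prints 0
}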
1 parent 54d8ded commit 18b2180

9 files changed: +1689 -775 lines

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 125 additions & 33 deletions
@@ -360,6 +360,52 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
          !isRegisterVectorElementType(Ty.getElementType());
 }
 
+/// Return true if we should legalize a load by widening an odd sized memory
+/// access up to the alignment. Note this is the case when the memory access
+/// itself changes, not the size of the result register.
+static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
+                            unsigned AlignInBits, unsigned AddrSpace,
+                            unsigned Opcode) {
+  // We don't want to widen cases that are naturally legal.
+  if (isPowerOf2_32(SizeInBits))
+    return false;
+
+  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
+  // end up widening these for a scalar load during RegBankSelect, since there
+  // aren't 96-bit scalar loads.
+  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
+    return false;
+
+  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
+    return false;
+
+  // A load is known dereferenceable up to the alignment, so it's legal to widen
+  // to it.
+  //
+  // TODO: Could check dereferenceable for less aligned cases.
+  unsigned RoundedSize = NextPowerOf2(SizeInBits);
+  if (AlignInBits < RoundedSize)
+    return false;
+
+  // Do not widen if it would introduce a slow unaligned load.
+  const SITargetLowering *TLI = ST.getTargetLowering();
+  bool Fast = false;
+  return TLI->allowsMisalignedMemoryAccessesImpl(
+             RoundedSize, AddrSpace, Align(AlignInBits / 8),
+             MachineMemOperand::MOLoad, &Fast) &&
+         Fast;
+}
+
+static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
+                            unsigned Opcode) {
+  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
+    return false;
+
+  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
+                         Query.MMODescrs[0].AlignInBits,
+                         Query.Types[1].getAddressSpace(), Opcode);
+}
+
 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                          const GCNTargetMachine &TM)
   : ST(ST_) {
@@ -1005,24 +1051,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     return false;
   };
 
-  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
-                                         unsigned Opc) -> bool {
-    unsigned Size = Query.Types[0].getSizeInBits();
-    if (isPowerOf2_32(Size))
-      return false;
-
-    if (Size == 96 && ST.hasDwordx3LoadStores())
-      return false;
-
-    unsigned AddrSpace = Query.Types[1].getAddressSpace();
-    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
-      return false;
-
-    unsigned Align = Query.MMODescrs[0].AlignInBits;
-    unsigned RoundedSize = NextPowerOf2(Size);
-    return (Align >= RoundedSize);
-  };
-
   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
@@ -1087,19 +1115,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         Query.MMODescrs[0].SizeInBits);
         }, bitcastToRegisterType(0));
 
+    if (!IsStore) {
+      // Widen suitably aligned loads by loading extra bytes. The standard
+      // legalization actions can't properly express widening memory operands.
+      Actions.customIf([=](const LegalityQuery &Query) -> bool {
+        return shouldWidenLoad(ST, Query, G_LOAD);
+      });
+    }
+
+    // FIXME: load/store narrowing should be moved to lower action
     Actions
-        .customIf(typeIs(1, Constant32Ptr))
-        // Widen suitably aligned loads by loading extra elements.
-        .moreElementsIf([=](const LegalityQuery &Query) {
-          const LLT Ty = Query.Types[0];
-          return Op == G_LOAD && Ty.isVector() &&
-                 shouldWidenLoadResult(Query, Op);
-        }, moreElementsToNextPow2(0))
-        .widenScalarIf([=](const LegalityQuery &Query) {
-          const LLT Ty = Query.Types[0];
-          return Op == G_LOAD && !Ty.isVector() &&
-                 shouldWidenLoadResult(Query, Op);
-        }, widenScalarOrEltToNextPow2(0))
         .narrowScalarIf(
             [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
@@ -1205,15 +1230,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
           // May need relegalization for the scalars.
           return std::make_pair(0, EltTy);
         })
-        .minScalar(0, S32);
+        .lowerIfMemSizeNotPow2()
+        .minScalar(0, S32);
 
     if (IsStore)
       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
 
-    // TODO: Need a bitcast lower option?
     Actions
         .widenScalarToNextPow2(0)
-        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
+        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
+        .lower();
   }
 
   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
@@ -2303,6 +2329,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
   return true;
 }
 
+static LLT widenToNextPowerOf2(LLT Ty) {
+  if (Ty.isVector())
+    return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
+  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
+}
+
 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                        MachineInstr &MI) const {
   MachineIRBuilder &B = Helper.MIRBuilder;
@@ -2322,6 +2354,66 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
     return true;
   }
 
+  Register ValReg = MI.getOperand(0).getReg();
+  LLT ValTy = MRI.getType(ValReg);
+
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+  const unsigned ValSize = ValTy.getSizeInBits();
+  const unsigned MemSize = 8 * MMO->getSize();
+  const Align MemAlign = MMO->getAlign();
+  const unsigned AlignInBits = 8 * MemAlign.value();
+
+  // Widen non-power-of-2 loads to the alignment if needed
+  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
+    const unsigned WideMemSize = PowerOf2Ceil(MemSize);
+
+    // This was already the correct extending load result type, so just adjust
+    // the memory type.
+    if (WideMemSize == ValSize) {
+      MachineFunction &MF = B.getMF();
+
+      // FIXME: This is losing AA metadata
+      MachineMemOperand *WideMMO =
+          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
+      Observer.changingInstr(MI);
+      MI.setMemRefs(MF, {WideMMO});
+      Observer.changedInstr(MI);
+      return true;
+    }
+
+    // Don't bother handling edge case that should probably never be produced.
+    if (ValSize > WideMemSize)
+      return false;
+
+    LLT WideTy = widenToNextPowerOf2(ValTy);
+
+    // FIXME: This is losing AA metadata
+    Register WideLoad;
+    if (!WideTy.isVector()) {
+      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+      B.buildTrunc(ValReg, WideLoad).getReg(0);
+    } else {
+      // Extract the subvector.
+
+      if (isRegisterType(ValTy)) {
+        // If this is a case where G_EXTRACT is legal, use it.
+        // (e.g. <3 x s32> -> <4 x s32>)
+        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+        B.buildExtract(ValReg, WideLoad, 0);
+      } else {
+        // For cases where the widened type isn't a nice register value, unmerge
+        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
+        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
+        B.setInsertPt(B.getMBB(), MI.getIterator());
+        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
+      }
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
   return false;
 }
 
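As a side note, here is a small standalone C++ sketch (illustration only, not part of the commit; the buffer and names are made up) of why the widened access in legalizeLoad above is observationally equivalent for a naturally aligned <3 x s16> load: the alignment guarantees the extra bytes stay within the same dereferenceable region, and they are dropped again by the extract/unmerge step.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // An 8-byte aligned object holding a <3 x s16> value; the fourth lane models
  // whatever bytes happen to follow within the same aligned region.
  alignas(8) std::array<uint16_t, 4> Storage = {1, 2, 3, 0xDEAD};

  // Original load: exactly the 6 bytes of the <3 x s16> value.
  std::array<uint16_t, 3> Narrow{};
  std::memcpy(Narrow.data(), Storage.data(), 6);

  // Widened load: 8 bytes (<4 x s16>), then keep only the low three lanes,
  // mirroring the G_EXTRACT / unmerge after the wide G_LOAD.
  std::array<uint16_t, 4> Wide{};
  std::memcpy(Wide.data(), Storage.data(), 8);
  std::array<uint16_t, 3> FromWide = {Wide[0], Wide[1], Wide[2]};

  // Both loads produce the same observable value.
  assert(Narrow == FromWide);
  return 0;
}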