@@ -360,6 +360,52 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
          !isRegisterVectorElementType(Ty.getElementType());
 }
 
+/// Return true if we should legalize a load by widening an odd sized memory
+/// access up to the alignment. Note this is the case where the memory access
+/// itself changes, not the size of the result register.
+static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
+                            unsigned AlignInBits, unsigned AddrSpace,
+                            unsigned Opcode) {
+  // We don't want to widen cases that are naturally legal.
+  if (isPowerOf2_32(SizeInBits))
+    return false;
+
+  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
+  // end up widening these for a scalar load during RegBankSelect, since there
+  // aren't 96-bit scalar loads.
+  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
+    return false;
+
+  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
+    return false;
+
+  // A load is known dereferenceable up to the alignment, so it's legal to
+  // widen to it.
+  //
+  // TODO: Could check dereferenceable for less aligned cases.
+  unsigned RoundedSize = NextPowerOf2(SizeInBits);
+  if (AlignInBits < RoundedSize)
+    return false;
+
+  // Do not widen if it would introduce a slow unaligned load.
+  const SITargetLowering *TLI = ST.getTargetLowering();
+  bool Fast = false;
+  return TLI->allowsMisalignedMemoryAccessesImpl(
+             RoundedSize, AddrSpace, Align(AlignInBits / 8),
+             MachineMemOperand::MOLoad, &Fast) &&
+         Fast;
+}
+
+static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
+                            unsigned Opcode) {
+  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
+    return false;
+
+  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
+                         Query.MMODescrs[0].AlignInBits,
+                         Query.Types[1].getAddressSpace(), Opcode);
+}
+
 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                          const GCNTargetMachine &TM)
   : ST(ST_) {
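
For intuition, the gate in shouldWidenLoad boils down to a power-of-2 rounding check against the alignment. A minimal standalone sketch (shouldWidenLoadModel is our name, not LLVM's; the 96-bit special case, the maxSizeForAddrSpace limit, and the slow-unaligned-access query are intentionally elided):

    #include <cassert>

    // Widen only when the memory size is not already a power of 2 and the
    // next power of 2 still fits inside what the alignment proves
    // dereferenceable.
    static bool shouldWidenLoadModel(unsigned SizeInBits, unsigned AlignInBits) {
      if ((SizeInBits & (SizeInBits - 1)) == 0)
        return false; // naturally legal, nothing to widen
      unsigned Rounded = 1;
      while (Rounded < SizeInBits) // NextPowerOf2 for non-power-of-2 input
        Rounded <<= 1;
      return AlignInBits >= Rounded;
    }

    int main() {
      assert(shouldWidenLoadModel(96, 128)); // s96 at 16-byte align -> s128 load
      assert(!shouldWidenLoadModel(96, 32)); // would read past the known bytes
      assert(!shouldWidenLoadModel(64, 64)); // already a power of 2
    }
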
@@ -1005,24 +1051,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     return false;
   };
 
-  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
-                                         unsigned Opc) -> bool {
-    unsigned Size = Query.Types[0].getSizeInBits();
-    if (isPowerOf2_32(Size))
-      return false;
-
-    if (Size == 96 && ST.hasDwordx3LoadStores())
-      return false;
-
-    unsigned AddrSpace = Query.Types[1].getAddressSpace();
-    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
-      return false;
-
-    unsigned Align = Query.MMODescrs[0].AlignInBits;
-    unsigned RoundedSize = NextPowerOf2(Size);
-    return (Align >= RoundedSize);
-  };
-
   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
@@ -1087,19 +1115,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                           Query.MMODescrs[0].SizeInBits);
       }, bitcastToRegisterType(0));
 
+    if (!IsStore) {
+      // Widen suitably aligned loads by loading extra bytes. The standard
+      // legalization actions can't properly express widening memory operands.
+      Actions.customIf([=](const LegalityQuery &Query) -> bool {
+        return shouldWidenLoad(ST, Query, G_LOAD);
+      });
+    }
+
+    // FIXME: load/store narrowing should be moved to lower action
     Actions
-        .customIf(typeIs(1, Constant32Ptr))
-        // Widen suitably aligned loads by loading extra elements.
-        .moreElementsIf([=](const LegalityQuery &Query) {
-          const LLT Ty = Query.Types[0];
-          return Op == G_LOAD && Ty.isVector() &&
-                 shouldWidenLoadResult(Query, Op);
-        }, moreElementsToNextPow2(0))
-        .widenScalarIf([=](const LegalityQuery &Query) {
-          const LLT Ty = Query.Types[0];
-          return Op == G_LOAD && !Ty.isVector() &&
-                 shouldWidenLoadResult(Query, Op);
-        }, widenScalarOrEltToNextPow2(0))
         .narrowScalarIf(
             [=](const LegalityQuery &Query) -> bool {
               return !Query.Types[0].isVector() &&
@@ -1205,15 +1230,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
           // May need relegalization for the scalars.
           return std::make_pair(0, EltTy);
         })
-        .minScalar(0, S32);
+        .lowerIfMemSizeNotPow2()
+        .minScalar(0, S32);
 
     if (IsStore)
       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
 
-    // TODO: Need a bitcast lower option?
     Actions
         .widenScalarToNextPow2(0)
-        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
+        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
+        .lower();
   }
 
   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
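
For contrast with the custom widening path registered above: when widening is declined (for example an odd-sized access without enough alignment), the newly added lowerIfMemSizeNotPow2() and the trailing lower() fall back to splitting the access into power-of-2 pieces. A hedged sketch of that general idea, not LLVM's exact narrowing algorithm:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Hypothetical sketch: greedily peel off the largest power-of-2 piece
    // until the access is consumed, e.g. a 96-bit load -> s64 + s32 loads.
    static std::vector<uint64_t> splitIntoPow2Pieces(uint64_t SizeInBits) {
      std::vector<uint64_t> Pieces;
      while (SizeInBits) {
        uint64_t Piece = SizeInBits;
        while (Piece & (Piece - 1)) // clear low set bits until one remains:
          Piece &= Piece - 1;       // largest power of 2 <= SizeInBits
        Pieces.push_back(Piece);
        SizeInBits -= Piece;
      }
      return Pieces;
    }

    int main() {
      auto P = splitIntoPow2Pieces(96); // s96 -> s64 + s32
      assert(P.size() == 2 && P[0] == 64 && P[1] == 32);
    }
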
@@ -2303,6 +2329,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
   return true;
 }
 
+static LLT widenToNextPowerOf2(LLT Ty) {
+  if (Ty.isVector())
+    return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
+  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
+}
+
 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                        MachineInstr &MI) const {
   MachineIRBuilder &B = Helper.MIRBuilder;
@@ -2322,6 +2354,66 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
     return true;
   }
 
+  Register ValReg = MI.getOperand(0).getReg();
+  LLT ValTy = MRI.getType(ValReg);
+
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+  const unsigned ValSize = ValTy.getSizeInBits();
+  const unsigned MemSize = 8 * MMO->getSize();
+  const Align MemAlign = MMO->getAlign();
+  const unsigned AlignInBits = 8 * MemAlign.value();
+
+  // Widen non-power-of-2 loads to the alignment if needed.
+  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
+    const unsigned WideMemSize = PowerOf2Ceil(MemSize);
+
+    // This was already the correct extending load result type, so just adjust
+    // the memory type.
+    if (WideMemSize == ValSize) {
+      MachineFunction &MF = B.getMF();
+
+      // FIXME: This is losing AA metadata.
+      MachineMemOperand *WideMMO =
+          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
+      Observer.changingInstr(MI);
+      MI.setMemRefs(MF, {WideMMO});
+      Observer.changedInstr(MI);
+      return true;
+    }
+
+    // Don't bother handling an edge case that should probably never be
+    // produced.
+    if (ValSize > WideMemSize)
+      return false;
+
+    LLT WideTy = widenToNextPowerOf2(ValTy);
+
+    // FIXME: This is losing AA metadata.
+    Register WideLoad;
+    if (!WideTy.isVector()) {
+      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+      B.buildTrunc(ValReg, WideLoad).getReg(0);
+    } else {
+      // Extract the subvector.
+
+      if (isRegisterType(ValTy)) {
+        // If this is a case where G_EXTRACT is legal, use it.
+        // (e.g. <3 x s32> -> <4 x s32>)
+        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+        B.buildExtract(ValReg, WideLoad, 0);
+      } else {
+        // For cases where the widened type isn't a nice register value,
+        // unmerge from a widened register (e.g. <3 x s16> -> <4 x s16>).
+        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
+        B.setInsertPt(B.getMBB(), MI.getIterator());
+        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
+      }
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
   return false;
 }
 
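
The widening path in legalizeLoad above picks one of three strategies depending on how the widened memory size relates to the result type. A hypothetical distillation of just that case split (pickStrategy, WidenStrategy, and powerOf2Ceil are illustrative stand-ins, not LLVM API):

    #include <cassert>
    #include <cstdint>

    // One label per branch of legalizeLoad:
    //   WidenMMOOnly - extending load already had the widened result type;
    //                  only the memory operand changes (load 12 -> load 16).
    //   TruncScalar  - load the wide scalar, then G_TRUNC back down.
    //   WideVector   - load the wide vector, then G_EXTRACT or unmerge
    //                  (e.g. <3 x s32> taken from a <4 x s32> load).
    enum class WidenStrategy { WidenMMOOnly, TruncScalar, WideVector, Unsupported };

    static uint64_t powerOf2Ceil(uint64_t V) { // smallest power of 2 >= V
      uint64_t R = 1;
      while (R < V)
        R <<= 1;
      return R;
    }

    static WidenStrategy pickStrategy(unsigned ValSizeInBits,
                                      unsigned MemSizeInBits, bool ValIsVector) {
      const uint64_t WideMemSize = powerOf2Ceil(MemSizeInBits);
      if (WideMemSize == ValSizeInBits)
        return WidenStrategy::WidenMMOOnly;
      if (ValSizeInBits > WideMemSize) // edge case the patch declines to handle
        return WidenStrategy::Unsupported;
      return ValIsVector ? WidenStrategy::WideVector : WidenStrategy::TruncScalar;
    }

    int main() {
      assert(pickStrategy(128, 96, false) == WidenStrategy::WidenMMOOnly);
      assert(pickStrategy(96, 96, false) == WidenStrategy::TruncScalar);
      assert(pickStrategy(96, 96, true) == WidenStrategy::WideVector);
    }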