Skip to content

Commit a565d43

Browse files
jrbyrnes authored and bcahoon committed
[AMDGPU] Add off-by-default flag to control LiveRegOpt
Change-Id: Id939bf74b48b47e5ee2b432956e476fac80e3375 (cherry picked from commit 5b1a599)
1 parent fa5860c commit a565d43

16 files changed

+4160
-411
lines changed

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,8 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
180180
for (auto &BB : reverse(F))
181181
for (Instruction &I : make_early_inc_range(reverse(BB))) {
182182
Changed |= !HasScalarSubwordLoads && visit(I);
183-
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
183+
if (ST.shouldCoerceIllegalTypes())
184+
Changed |= LRO.optimizeLiveType(&I, DeadInsts);
184185
}
185186

186187
RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,11 @@ static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
5959
cl::desc("Number of addresses from which to enable MIMG NSA."),
6060
cl::init(3), cl::Hidden);
6161

62+
static cl::opt<bool>
63+
CoerceIllegal("amdgpu-coerce-illegal-types",
64+
cl::desc("Whether or not to coerce illegal types"),
65+
cl::ReallyHidden, cl::init(false));
66+
6267
GCNSubtarget::~GCNSubtarget() = default;
6368

6469
GCNSubtarget &
@@ -199,6 +204,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
199204
RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
200205
InstSelector.reset(new AMDGPUInstructionSelector(
201206
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
207+
208+
ShouldCoerceIllegalTypes = CoerceIllegal;
202209
}
203210

204211
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -314,7 +314,8 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
314314
// queries (e.g. get*InstrCost) to decide the proper handling
315315
// of 8 bit vectors.
316316
if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
317-
if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
317+
if (ST->shouldCoerceIllegalTypes() &&
318+
DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
318319
unsigned ElCount = VTy->getElementCount().getFixedValue();
319320
return PowerOf2Ceil(ElCount / 4);
320321
}
@@ -355,10 +356,10 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
355356
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
356357
return 32 * 4 / ElemWidth;
357358

358-
return (ElemWidth == 8) ? 4
359-
: (ElemWidth == 16) ? 2
360-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
361-
: 1;
359+
return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4
360+
: (ElemWidth == 16) ? 2
361+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
362+
: 1;
362363
}
363364

364365
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1154,7 +1155,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11541155

11551156
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
11561157
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1157-
(ScalarSize == 16 || ScalarSize == 8)) {
1158+
(ScalarSize == 16 ||
1159+
(ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) {
11581160
// Larger vector widths may require additional instructions, but are
11591161
// typically cheaper than scalarized versions.
11601162
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -235,6 +235,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
235235
// Dummy feature to use for assembler in tablegen.
236236
bool FeatureDisable = false;
237237

238+
bool ShouldCoerceIllegalTypes = false;
239+
238240
SelectionDAGTargetInfo TSInfo;
239241
private:
240242
SIInstrInfo InstrInfo;
@@ -1305,6 +1307,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13051307
// of sign-extending.
13061308
bool hasGetPCZeroExtension() const { return GFX12Insts; }
13071309

1310+
/// \returns whether or not we should coerce illegal types into vectors of
1311+
/// legal types for values that span basic blocks.
1312+
bool shouldCoerceIllegalTypes() const { return ShouldCoerceIllegalTypes; }
1313+
13081314
/// \returns SGPR allocation granularity supported by the subtarget.
13091315
unsigned getSGPRAllocGranule() const {
13101316
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);

llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll

Lines changed: 92 additions & 92 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll

Lines changed: 19 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -8,30 +8,29 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
88
; CHECK: ; %bb.0: ; %bb
99
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
1010
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8
11-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
1211
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
1312
; CHECK-NEXT: s_bitcmp0_b32 s0, 0
1413
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
1514
; CHECK-NEXT: ; %bb.1: ; %bb10
16-
; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[8:9]
15+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
16+
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9]
1717
; CHECK-NEXT: s_waitcnt vmcnt(0)
18-
; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8
19-
; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8
20-
; CHECK-NEXT: v_bfe_u32 v5, v8, 16, 8
21-
; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8
22-
; CHECK-NEXT: v_and_b32_e32 v3, 0xff, v9
23-
; CHECK-NEXT: v_bfe_u32 v2, v9, 8, 8
24-
; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 8
25-
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 24, v9
18+
; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0
19+
; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
20+
; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v0
21+
; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v1
22+
; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
23+
; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v1
2624
; CHECK-NEXT: s_branch .LBB0_3
2725
; CHECK-NEXT: .LBB0_2:
28-
; CHECK-NEXT: v_mov_b32_e32 v1, 0
2926
; CHECK-NEXT: v_mov_b32_e32 v2, 0
3027
; CHECK-NEXT: v_mov_b32_e32 v3, 0
3128
; CHECK-NEXT: v_mov_b32_e32 v4, 0
29+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
3230
; CHECK-NEXT: v_mov_b32_e32 v5, 0
3331
; CHECK-NEXT: v_mov_b32_e32 v6, 0
3432
; CHECK-NEXT: v_mov_b32_e32 v7, 0
33+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
3534
; CHECK-NEXT: .LBB0_3: ; %bb41
3635
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48
3736
; CHECK-NEXT: v_mov_b32_e32 v8, s10
@@ -48,16 +47,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
4847
; CHECK-NEXT: v_mov_b32_e32 v19, s21
4948
; CHECK-NEXT: v_mov_b32_e32 v20, s22
5049
; CHECK-NEXT: v_mov_b32_e32 v21, s23
51-
; CHECK-NEXT: flat_store_byte v[8:9], v7
52-
; CHECK-NEXT: flat_store_byte v[10:11], v6
53-
; CHECK-NEXT: flat_store_byte v[12:13], v5
54-
; CHECK-NEXT: flat_store_byte v[14:15], v4
55-
; CHECK-NEXT: flat_store_byte v[16:17], v3
56-
; CHECK-NEXT: flat_store_byte v[18:19], v2
57-
; CHECK-NEXT: flat_store_byte v[20:21], v1
50+
; CHECK-NEXT: flat_store_byte v[8:9], v0
51+
; CHECK-NEXT: flat_store_byte v[10:11], v7
52+
; CHECK-NEXT: flat_store_byte v[12:13], v6
53+
; CHECK-NEXT: flat_store_byte v[14:15], v5
54+
; CHECK-NEXT: flat_store_byte v[16:17], v1
55+
; CHECK-NEXT: flat_store_byte v[18:19], v4
56+
; CHECK-NEXT: flat_store_byte v[20:21], v3
5857
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
59-
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
60-
; CHECK-NEXT: flat_store_byte v[2:3], v0
58+
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
59+
; CHECK-NEXT: flat_store_byte v[0:1], v2
6160
; CHECK-NEXT: s_endpgm
6261
bb:
6362
br i1 %arg, label %bb10, label %bb41

0 commit comments

Comments
 (0)