
Commit d85d143

[AMDGPU] New image intrinsic optimizer pass (#67151)
Implement a new pass to combine multiple image_load_2dmsaa and 2darraymsaa intrinsic calls into a single image_msaa_load if:

- they refer to the same vaddr except for sample_id,
- they use a constant sample_id and they fall into the same group,
- they have the same dmask, and the number of instructions and the number of vaddr/vdata dword transfers is reduced by the combine.

This should be valid on all GFX11, but a hardware bug renders it unworkable on GFX11.0.*, so it is only enabled for GFX11.5.

Based on a patch by Rodrigo Dominguez!
1 parent b797a6a commit d85d143

9 files changed: +1583 −1 lines changed


llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 13 additions & 0 deletions
@@ -49,6 +49,7 @@ FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
 
 FunctionPass *createSIPostRABundlerPass();
+FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
 ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPULateCodeGenPreparePass();
@@ -64,6 +65,15 @@ struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+struct AMDGPUImageIntrinsicOptimizerPass
+    : PassInfoMixin<AMDGPUImageIntrinsicOptimizerPass> {
+  AMDGPUImageIntrinsicOptimizerPass(TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  TargetMachine &TM;
+};
+
 struct AMDGPUUseNativeCallsPass : PassInfoMixin<AMDGPUUseNativeCallsPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
@@ -175,6 +185,9 @@ extern char &SIOptimizeExecMaskingID;
 void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
 extern char &SIPreAllocateWWMRegsID;
 
+void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &);
+extern char &AMDGPUImageIntrinsicOptimizerID;
+
 void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
 extern char &AMDGPUPerfHintAnalysisID;
 
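The six remaining files in the commit (not reproduced on this page) register the pass and schedule it in the codegen pipeline. A hedged sketch of what the hookup in AMDGPUTargetMachine.cpp plausibly looks like; the exact call site and any guarding option are assumptions, not part of the diff shown here:

  // Hypothetical sketch of the pipeline hookup (actual location and
  // guards may differ); TM is the TargetMachine* held by TargetPassConfig.
  addPass(createAMDGPUImageIntrinsicOptimizerPass(TM));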

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 1 deletion
@@ -281,6 +281,12 @@ def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug",
   "MAD_U64/I64 intra instruction forwarding bug"
 >;
 
+def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
+  "HasMSAALoadDstSelBug",
+  "true",
+  "MSAA loads not honoring dst_sel bug"
+>;
+
 class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
   "ldsbankcount"#Value,
   "LDSBankCount",
@@ -1355,7 +1361,8 @@ def FeatureISAVersion11_Common : FeatureSet<
 
 def FeatureISAVersion11_0_Common : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
-              [FeatureVALUTransUseHazard])>;
+              [FeatureMSAALoadDstSelBug,
+               FeatureVALUTransUseHazard])>;
 
 def FeatureISAVersion11_0_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_0_Common.Features,
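The TableGen feature above materializes as a HasMSAALoadDstSelBug flag on the subtarget, which the new pass queries through an accessor to bail out on GFX11.0.x. A minimal sketch of that accessor as it would appear in GCNSubtarget.h (one of the files not shown here; the exact declaration is an assumption):

  // Sketch: subtarget query used by the pass to detect the GFX11.0 bug.
  bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }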
llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp

Lines changed: 341 additions & 0 deletions

@@ -0,0 +1,341 @@
//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for sample_id,
// - they use a constant sample_id and they fall into the same group,
// - they have the same dmask and the number of intrinsics and the number of
//   vaddr/vdata dword transfers is reduced by the combine.
//
// Examples for the tradeoff (all assume 2DMsaa for vaddr):
//
// +----------+-----+-----+-------+---------+------------+---------+----------+
// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
// | (dmask)  |     |     |       | vdata   |            | vdata   |          |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
// +----------+-----+-----+-------+---------+------------+---------+----------+
// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
// +----------+-----+-----+-------+---------+------------+---------+----------+
//
// Some cases are of questionable benefit, like the one marked with "yes?"
// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
// and TX, but higher vdata. We start by erring on the side of converting these
// to MSAA_LOAD.
//
// clang-format off
//
// This pass will combine intrinsics such as (not necessarily consecutive):
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
// ==>
//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// clang-format on
//
// Future improvements:
//
// - We may occasionally not want to do the combine if it increases the maximum
//   register pressure.
//
// - Ensure clausing when multiple MSAA_LOAD are generated.
//
// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
// we don't know the format at compile time.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
65+
#include "AMDGPUInstrInfo.h"
66+
#include "AMDGPUTargetMachine.h"
67+
#include "llvm/IR/Function.h"
68+
#include "llvm/IR/IRBuilder.h"
69+
#include "llvm/IR/IntrinsicInst.h"
70+
#include "llvm/IR/IntrinsicsAMDGPU.h"
71+
#include "llvm/Pass.h"
72+
#include "llvm/Support/raw_ostream.h"
73+
74+
using namespace llvm;
75+
76+
#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
77+
78+
namespace {
79+
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
80+
const TargetMachine *TM;
81+
82+
public:
83+
static char ID;
84+
85+
AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
86+
: FunctionPass(ID), TM(TM) {}
87+
88+
bool runOnFunction(Function &F) override;
89+
90+
}; // End of class AMDGPUImageIntrinsicOptimizer
91+
} // End anonymous namespace
92+
93+
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
94+
"AMDGPU Image Intrinsic Optimizer", false, false)
95+
96+
char AMDGPUImageIntrinsicOptimizer::ID = 0;
97+
98+
void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check DMask.
    Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
    Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
    if (DMaskList != DMask)
      continue;

    // Check VAddr (except FragId).
    int I = ImageDimIntr->VAddrStart;
    for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
      if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
        break;
    }

    if (I != ImageDimIntr->VAddrEnd - 1)
      continue;

    // Check FragId group.
    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
    auto IIListFragId = cast<ConstantInt>(FragIdList);
    auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
    if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}

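// Note on the grouping rule: the udiv(4) comparison above buckets constant
// sample_ids by fours (0..3, 4..7, ...), matching the four fragments a single
// IMAGE_MSAA_LOAD returns. For example, loads with FragIds 1 and 3 can merge,
// but a load with FragId 4 starts a new mergeable list even if every other
// operand matches.
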
// Collect list of all instructions we know how to merge in a subset of the
// block. It returns an iterator to the instruction after the last one analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // Don't combine if there is a store in the middle or if there is a memory
    // barrier.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    // Ignore non-intrinsics.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Ignore other intrinsics.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // Check for constant FragId.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}

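// Note on sectioning: any side-effecting instruction (e.g. a store or a
// fence) ends the current section via the mayHaveSideEffects() check above,
// so candidate loads are only merged when nothing between them could have
// changed the image contents.
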
bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

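    // Worked example of the cost check above (first row of the table in the
    // file header): dmask popcount 1, no d16, 4 loads give NumVAddrLoads = 12
    // and NumVDataLoads = 4, versus one msaa_load with NumVAddrMsaas = 3 and
    // NumVDataMsaas = 4; 16 >= 7, so the combine goes ahead.
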
    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create the new image_msaa_load intrinsic.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the new extractelement instructions.
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (std::none_of(M->begin(), M->end(), [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Modified = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
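For readers who want to exercise the transform in isolation, here is a minimal sketch of an opt invocation and input. It assumes the new-pass-manager registration (in one of the six commit files not shown on this page) exposes the pass under its DEBUG_TYPE string, amdgpu-image-intrinsic-opt, and that gfx1150 is a GFX11.5 target without the dst_sel bug:

  ; RUN: opt -mtriple=amdgcn -mcpu=gfx1150 -passes=amdgpu-image-intrinsic-opt -S %s
  define amdgpu_ps float @sample0(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
    %v0 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
    %v1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
    %r = fadd float %v0, %v1
    ret float %r
  }
  declare float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32)

With two loads of a one-channel dmask this is the "yes?" row of the tradeoff table (6+2 = 8 dwords for the loads versus 3+4 = 7 for the msaa_load), so the pass should rewrite both calls into a single @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32 followed by two extractelements.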
