| 1 | +//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// |
| 9 | +// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa |
| 10 | +// or dim=2darraymsaa into a single image_msaa_load intrinsic if: |
| 11 | +// |
| 12 | +// - they refer to the same vaddr except for sample_id, |
| 13 | +// - they use a constant sample_id and they fall into the same group, |
| 14 | +// - they have the same dmask, and the combine reduces both the number of
| 15 | +//   intrinsics and the number of vaddr/vdata dword transfers.
| 16 | +// |
| 17 | +// Examples for the tradeoff (all are assuming 2DMsaa for vaddr): |
| 18 | +// |
| 19 | +// +----------+-----+-----+-------+---------+------------+---------+----------+ |
| 20 | +// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? | |
| 21 | +// | (dmask) | | | | vdata | | vdata | | |
| 22 | +// +----------+-----+-----+-------+---------+------------+---------+----------+ |
| 23 | +// | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes | |
| 24 | +// +----------+-----+-----+-------+---------+------------+---------+----------+ |
| 25 | +// | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? | |
| 26 | +// +----------+-----+-----+-------+---------+------------+---------+----------+ |
| 27 | +// | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes | |
| 28 | +// +----------+-----+-----+-------+---------+------------+---------+----------+ |
| 29 | +// | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no | |
| 30 | +// +----------+-----+-----+-------+---------+------------+---------+----------+ |
| 31 | +// | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes | |
| 32 | +// +----------+-----+-----+-------+---------+------------+---------+----------+ |
| 33 | +// |
| 34 | +// Some cases are of questionable benefit, like the one marked with "yes?"
| 35 | +// above: fewer intrinsics, fewer vaddr dwords, and fewer total transfers
| 36 | +// between SP and TX, but more vdata dwords. For now we err on the side of
| 37 | +// converting these to MSAA_LOAD.
| 38 | +// |
| 39 | +// clang-format off |
| 40 | +// |
| 41 | +// This pass will combine intrinsics such as (not necessarily consecutive):
| 42 | +// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) |
| 43 | +// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0) |
| 44 | +// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0) |
| 45 | +// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0) |
| 46 | +// ==> |
| 47 | +// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) |
| 48 | +// |
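| | +// As an additional illustrative sketch (the value names are hypothetical,
| | +// mirroring the example above), loads that read two channels (dmask 3) are
| | +// combined into one msaa_load per channel:
| | +// call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 3, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
| | +// ... (likewise for sample ids 1, 2 and 3)
| | +// ==>
| | +// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
| | +// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 2, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
| | +// with each original <2 x float> result rebuilt from the two msaa_load
| | +// results via extractelement/insertelement.
| | +//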
| 49 | +// clang-format on |
| 50 | +// |
| 51 | +// Future improvements: |
| 52 | +// |
| 53 | +// - We may occasionally not want to do the combine if it increases the maximum |
| 54 | +// register pressure. |
| 55 | +// |
| 56 | +// - Ensure clausing when multiple MSAA_LOAD are generated. |
| 57 | +// |
| 58 | +// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this |
| 59 | +// combine only applies to gfx11, due to a limitation in gfx10: the gfx10 |
| 60 | +// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and |
| 61 | +// we don't know the format at compile time. |
| 62 | +//===----------------------------------------------------------------------===// |
| 63 | + |
| 64 | +#include "AMDGPU.h" |
| 65 | +#include "AMDGPUInstrInfo.h" |
| 66 | +#include "AMDGPUTargetMachine.h" |
| 67 | +#include "llvm/IR/Function.h" |
| 68 | +#include "llvm/IR/IRBuilder.h" |
| 69 | +#include "llvm/IR/IntrinsicInst.h" |
| 70 | +#include "llvm/IR/IntrinsicsAMDGPU.h" |
| 71 | +#include "llvm/Pass.h" |
| 72 | +#include "llvm/Support/raw_ostream.h" |
| 73 | + |
| 74 | +using namespace llvm; |
| 75 | + |
| 76 | +#define DEBUG_TYPE "amdgpu-image-intrinsic-opt" |
| 77 | + |
| 78 | +namespace { |
| 79 | +class AMDGPUImageIntrinsicOptimizer : public FunctionPass { |
| 80 | + const TargetMachine *TM; |
| 81 | + |
| 82 | +public: |
| 83 | + static char ID; |
| 84 | + |
| 85 | + AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr) |
| 86 | + : FunctionPass(ID), TM(TM) {} |
| 87 | + |
| 88 | + bool runOnFunction(Function &F) override; |
| 89 | + |
| 90 | +}; // End of class AMDGPUImageIntrinsicOptimizer |
| 91 | +} // End anonymous namespace |
| 92 | + |
| 93 | +INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, |
| 94 | + "AMDGPU Image Intrinsic Optimizer", false, false) |
| 95 | + |
| 96 | +char AMDGPUImageIntrinsicOptimizer::ID = 0; |
| 97 | + |
| 98 | +void addInstToMergeableList( |
| 99 | + IntrinsicInst *II, |
| 100 | + SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts, |
| 101 | + const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) { |
| 102 | + for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) { |
| 103 | + // Check Dim. |
| 104 | + if (IIList.front()->getIntrinsicID() != II->getIntrinsicID()) |
| 105 | + continue; |
| 106 | + |
| 107 | + // Check D16. |
| 108 | + if (IIList.front()->getType() != II->getType()) |
| 109 | + continue; |
| 110 | + |
| 111 | + // Check DMask. |
| 112 | + Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex); |
| 113 | + Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex); |
| 114 | + if (DMaskList != DMask) |
| 115 | + continue; |
| 116 | + |
| 117 | + // Check VAddr (except FragId). |
| 118 | + int I = ImageDimIntr->VAddrStart; |
| 119 | + for (; I < ImageDimIntr->VAddrEnd - 1; ++I) { |
| 120 | + if (IIList.front()->getArgOperand(I) != II->getArgOperand(I)) |
| 121 | + break; |
| 122 | + } |
| 123 | + |
| 124 | + if (I != ImageDimIntr->VAddrEnd - 1) |
| 125 | + continue; |
| 126 | + |
| 127 | + // Check FragId group. |
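| | +    // Constant sample ids 0..3 form group 0, 4..7 form group 1, and so on;
| | +    // only loads whose FragIds fall in the same group of four can share a
| | +    // single msaa_load.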
| 128 | + const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1; |
| 129 | + Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex); |
| 130 | + auto IIListFragId = cast<ConstantInt>(FragIdList); |
| 131 | + auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex)); |
| 132 | + if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4)) |
| 133 | + continue; |
| 134 | + |
| 135 | + // Add to the list. |
| 136 | + IIList.emplace_back(II); |
| 137 | + return; |
| 138 | + } |
| 139 | + |
| 140 | + // Similar instruction not found, so add a new list. |
| 141 | + MergeableInsts.emplace_back(1, II); |
| 142 | + LLVM_DEBUG(dbgs() << "New: " << *II << "\n"); |
| 143 | +} |
| 144 | + |
| 145 | +// Collect a list of all the instructions we know how to merge in a subset of
| 146 | +// the block. Returns an iterator to the instruction after the last one analyzed.
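| | +// Scanning stops after the first instruction that may have side effects, so
| | +// loads are only merged within a side-effect-free window of the block.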
| 147 | +BasicBlock::iterator collectMergeableInsts( |
| 148 | + BasicBlock::iterator I, BasicBlock::iterator E, |
| 149 | + SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) { |
| 150 | + for (; I != E; ++I) { |
| 151 | + // Don't combine if there is a store in the middle or if there is a memory |
| 152 | + // barrier. |
| 153 | + if (I->mayHaveSideEffects()) { |
| 154 | + ++I; |
| 155 | + break; |
| 156 | + } |
| 157 | + |
| 158 | + // Ignore non-intrinsics. |
| 159 | + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { |
| 160 | + Intrinsic::ID IntrinID = II->getIntrinsicID(); |
| 161 | + |
| 162 | + // Ignore other intrinsics. |
| 163 | + if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa && |
| 164 | + IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa) |
| 165 | + continue; |
| 166 | + |
| 167 | + // Check for constant FragId. |
| 168 | + const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID); |
| 169 | + const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1; |
| 170 | + if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex))) |
| 171 | + continue; |
| 172 | + |
| 173 | + LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n"); |
| 174 | + addInstToMergeableList(II, MergeableInsts, ImageDimIntr); |
| 175 | + } |
| 176 | + } |
| 177 | + |
| 178 | + return I; |
| 179 | +} |
| 180 | + |
| 181 | +bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) { |
| 182 | + bool Modified = false; |
| 183 | + |
| 184 | + SmallVector<Instruction *, 4> InstrsToErase; |
| 185 | + for (const auto &IIList : MergeableInsts) { |
| 186 | + if (IIList.size() <= 1) |
| 187 | + continue; |
| 188 | + |
| 189 | + // Assume the arguments are unchanged and later override them, if needed. |
| 190 | + SmallVector<Value *, 16> Args(IIList.front()->args()); |
| 191 | + |
| 192 | + // Validate function argument and return types, extracting overloaded |
| 193 | + // types along the way. |
| 194 | + SmallVector<Type *, 6> OverloadTys; |
| 195 | + Function *F = IIList.front()->getCalledFunction(); |
| 196 | + if (!Intrinsic::getIntrinsicSignature(F, OverloadTys)) |
| 197 | + continue; |
| 198 | + |
| 199 | + Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID(); |
| 200 | + const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = |
| 201 | + AMDGPU::getImageDimIntrinsicInfo(IntrinID); |
| 202 | + |
| 203 | + Type *EltTy = IIList.front()->getType()->getScalarType(); |
| 204 | + Type *NewTy = FixedVectorType::get(EltTy, 4); |
| 205 | + OverloadTys[0] = NewTy; |
| 206 | + bool isD16 = EltTy->isHalfTy(); |
| 207 | + |
| 208 | + ConstantInt *DMask = cast<ConstantInt>( |
| 209 | + IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex)); |
| 210 | + unsigned DMaskVal = DMask->getZExtValue() & 0xf; |
| 211 | + unsigned NumElts = popcount(DMaskVal); |
| 212 | + |
| 213 | + // Number of instructions and the number of vaddr/vdata dword transfers |
| 214 | + // should be reduced. |
| 215 | + unsigned NumLoads = IIList.size(); |
| 216 | + unsigned NumMsaas = NumElts; |
| 217 | + unsigned NumVAddrLoads = 3 * NumLoads; |
| 218 | + unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads; |
| 219 | + unsigned NumVAddrMsaas = 3 * NumMsaas; |
| 220 | + unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas; |
| 221 | + |
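| | +    // For example, the "yes?" row of the table above: two loads with dmask
| | +    // popcount 1 and no d16 give 6 + 2 = 8 load dwords versus 3 + 4 = 7
| | +    // msaa_load dwords, so the combine goes ahead.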
| 222 | + if (NumLoads < NumMsaas || |
| 223 | + (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas)) |
| 224 | + continue; |
| 225 | + |
| 226 | + const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1; |
| 227 | + auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex)); |
| 228 | + const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4; |
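| | +    // NewFragIdVal is FragId rounded down to the first sample of its group of
| | +    // four; each new msaa_load returns those four samples for one channel.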
| 229 | + |
| 230 | + // Create the new instructions. |
| 231 | + IRBuilder<> B(IIList.front()); |
| 232 | + |
| 233 | + // Create the new image_msaa_load intrinsic. |
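| | +    // One call is emitted per set dmask bit: each msaa_load returns the four
| | +    // samples of a single channel.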
| 234 | + SmallVector<Instruction *, 4> NewCalls; |
| 235 | + while (DMaskVal != 0) { |
| 236 | + unsigned NewMaskVal = 1 << countr_zero(DMaskVal); |
| 237 | + |
| 238 | + Intrinsic::ID NewIntrinID; |
| 239 | + if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa) |
| 240 | + NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa; |
| 241 | + else |
| 242 | + NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa; |
| 243 | + |
| 244 | + Function *NewIntrin = Intrinsic::getDeclaration( |
| 245 | + IIList.front()->getModule(), NewIntrinID, OverloadTys); |
| 246 | + Args[ImageDimIntr->DMaskIndex] = |
| 247 | + ConstantInt::get(DMask->getType(), NewMaskVal); |
| 248 | + Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal); |
| 249 | + CallInst *NewCall = B.CreateCall(NewIntrin, Args); |
| 250 | + LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n"); |
| 251 | + |
| 252 | + NewCalls.push_back(NewCall); |
| 253 | + DMaskVal -= NewMaskVal; |
| 254 | + } |
| 255 | + |
| 256 | + // Create the new extractelement instructions. |
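| | +    // Each original load's result is rebuilt by taking element (FragId % 4)
| | +    // from the corresponding per-channel msaa_load result.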
| 257 | + for (auto &II : IIList) { |
| 258 | + Value *VecOp = nullptr; |
| 259 | + auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex)); |
| 260 | + B.SetCurrentDebugLocation(II->getDebugLoc()); |
| 261 | + if (NumElts == 1) { |
| 262 | + VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4)); |
| 263 | + LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n"); |
| 264 | + } else { |
| 265 | + VecOp = UndefValue::get(II->getType()); |
| 266 | + for (unsigned I = 0; I < NumElts; ++I) { |
| 267 | + VecOp = B.CreateInsertElement( |
| 268 | + VecOp, |
| 269 | + B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I); |
| 270 | + LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n"); |
| 271 | + } |
| 272 | + } |
| 273 | + |
| 274 | + // Replace the old instruction. |
| 275 | + II->replaceAllUsesWith(VecOp); |
| 276 | + VecOp->takeName(II); |
| 277 | + InstrsToErase.push_back(II); |
| 278 | + } |
| 279 | + |
| 280 | + Modified = true; |
| 281 | + } |
| 282 | + |
| 283 | + for (auto I : InstrsToErase) |
| 284 | + I->eraseFromParent(); |
| 285 | + |
| 286 | + return Modified; |
| 287 | +} |
| 288 | + |
| 289 | +static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) { |
| 290 | + if (!TM) |
| 291 | + return false; |
| 292 | + |
| 293 | + // This optimization only applies to GFX11 and beyond. |
| 294 | + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); |
| 295 | + if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug()) |
| 296 | + return false; |
| 297 | + |
| 298 | + Module *M = F.getParent(); |
| 299 | + |
| 300 | + // Early test to determine if the intrinsics are used. |
| 301 | + if (std::none_of(M->begin(), M->end(), [](Function &F) { |
| 302 | + return !F.users().empty() && |
| 303 | + (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa || |
| 304 | + F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa); |
| 305 | + })) |
| 306 | + return false; |
| 307 | + |
| 308 | + bool Modified = false; |
| 309 | + for (auto &BB : F) { |
| 310 | + BasicBlock::iterator SectionEnd; |
| 311 | + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; |
| 312 | + I = SectionEnd) { |
| 313 | + SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts; |
| 314 | + |
| 315 | + SectionEnd = collectMergeableInsts(I, E, MergeableInsts); |
| 316 | + Modified |= optimizeSection(MergeableInsts); |
| 317 | + } |
| 318 | + } |
| 319 | + |
| 320 | + return Modified; |
| 321 | +} |
| 322 | + |
| 323 | +bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) { |
| 324 | + if (skipFunction(F)) |
| 325 | + return false; |
| 326 | + |
| 327 | + return imageIntrinsicOptimizerImpl(F, TM); |
| 328 | +} |
| 329 | + |
| 330 | +FunctionPass * |
| 331 | +llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) { |
| 332 | + return new AMDGPUImageIntrinsicOptimizer(TM); |
| 333 | +} |
| 334 | + |
| 335 | +PreservedAnalyses |
| 336 | +AMDGPUImageIntrinsicOptimizerPass::run(Function &F, |
| 337 | + FunctionAnalysisManager &AM) { |
| 338 | + |
| 339 | + bool Changed = imageIntrinsicOptimizerImpl(F, &TM); |
| 340 | + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
| 341 | +} |