@@ -15,11 +15,13 @@
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
+#include "AMDGPUTargetTransformInfo.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -45,6 +47,7 @@ class AMDGPULateCodeGenPrepare
   Function &F;
   const DataLayout &DL;
   const GCNSubtarget &ST;
+  const TargetTransformInfo &TTI;

   AssumptionCache *const AC;
   UniformityInfo &UA;
@@ -53,8 +56,9 @@ class AMDGPULateCodeGenPrepare

 public:
   AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
-                           AssumptionCache *AC, UniformityInfo &UA)
-      : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
+                           const TargetTransformInfo &TTI, AssumptionCache *AC,
+                           UniformityInfo &UA)
+      : F(F), DL(F.getDataLayout()), ST(ST), TTI(TTI), AC(AC), UA(UA) {}
   bool run();
   bool visitInstruction(Instruction &) { return false; }
@@ -75,6 +79,8 @@ class LiveRegOptimizer {
   Module &Mod;
   const DataLayout &DL;
   const GCNSubtarget &ST;
+  const TargetTransformInfo &TTI;
+
   /// The scalar type to convert to
   Type *const ConvertToScalar;
   /// The set of visited Instructions
@@ -125,8 +131,210 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }

-  LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
-      : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
+  // Filtering based on the operation or its cost.
+  // If an operation incurs a high enough cost, or natively works on a
+  // vector of illegal type, i.e. v2i8, then it makes sense to try to
+  // coerce it to a packed VGPR across basic blocks.
+  bool shouldReplaceByOp(Instruction *II) {
+    static const int SCALARIZE_INST_COST = 2;
+    static const int LRO_COST_THRES = 12;
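+    // For illustration: an op whose vector operand is v8i8 gets
+    // 8 * SCALARIZE_INST_COST = 16 added to its base cost and clears
+    // LRO_COST_THRES on its own, while a v4i8 operand adds 8 and needs a
+    // base cost of at least 4 to qualify.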
+
+    // Ignore pseudos
+    if (II->isDebugOrPseudoInst())
+      return false;
+
+    // Instruction Cost
+    auto Cost = TTI.getInstructionCost(
+        II, TargetTransformInfo::TargetCostKind::TCK_SizeAndLatency);
+    if (const auto *Def = II->getOperand(0)) {
+      if (const auto *DefTy = dyn_cast<FixedVectorType>(Def->getType())) {
+        const auto *ElTy = dyn_cast<IntegerType>(DefTy->getElementType());
+        // Assume vNi8 and vNi16 will be scalarized.
+        if (ElTy && ElTy->getBitWidth() <= 16) {
+          const auto ElCount = DefTy->getElementCount().getFixedValue();
+          Cost += SCALARIZE_INST_COST * ElCount;
+        }
+      }
+    }
+    LLVM_DEBUG(dbgs() << "shouldReplaceByOp: " << *II << " Cost=" << Cost
+                      << '\n';);
+    if (Cost >= LRO_COST_THRES)
+      return true;
+
+    if (isOpLegal(II))
+      return true;
+
+    return false;
+  }
+
+  /// Check if the intrinsic natively operates on 8-bit or 16-bit types.
+  bool isNativeIntrinsic(Intrinsic::ID ID) {
+    switch (ID) {
+    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
+    case Intrinsic::amdgcn_fdot2_f16_f16:
+    case Intrinsic::amdgcn_fdot2:
+    case Intrinsic::amdgcn_sdot4:
+    case Intrinsic::amdgcn_sdot2:
+    case Intrinsic::amdgcn_sdot8:
+    case Intrinsic::amdgcn_udot2:
+    case Intrinsic::amdgcn_udot4:
+    case Intrinsic::amdgcn_udot8:
+    case Intrinsic::amdgcn_sudot4:
+    case Intrinsic::amdgcn_sudot8:
+    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
+    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
+    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
+    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
+    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
+    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
+    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
+    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
+    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
+    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
+    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
+    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
+    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
+    case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
+    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
+    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
+    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
+    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
+    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
+    case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
+    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
+    case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
+    case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
+    case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
+    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
+    case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
+    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
+    case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
+    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
+      return true;
+    default:
+      return false;
+    }
+  }
+
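+  /// An operation is treated as "legal" here if its result type is legal for
+  /// the target, or if it is a store or an intrinsic that natively consumes
+  /// 8-bit/16-bit data despite the illegal IR type.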
+  bool isOpLegal(Instruction *I) {
+    Type *T = I->getType();
+    if (!TTI.isTypeLegal(T)) {
+      if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
+        Intrinsic::ID ID = Intr->getIntrinsicID();
+        if (isNativeIntrinsic(ID))
+          return true;
+      }
+      // Stores
+      if (isa<StoreInst>(I))
+        return true;
+      return false;
+    }
+    return true;
+  }
+
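+  /// Coercion is profitable if the defining instruction itself qualifies via
+  /// shouldReplaceByOp, or if any user in a different basic block (reached by
+  /// looking through PHIs, shuffles and insert/extract-element) qualifies.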
+  bool isCoercionProfitable(Instruction *II) {
+    if (shouldReplaceByOp(II))
+      return true;
+
+    // Look through users
+    bool Profitable = false;
+    SmallPtrSet<Instruction *, 4> CVisited;
+    SmallVector<Instruction *, 4> UserList;
+    for (User *V : II->users())
+      if (auto *UseInst = dyn_cast<Instruction>(V))
+        UserList.push_back(UseInst);
+
+    while (!UserList.empty() && !Profitable) {
+      auto *CII = UserList.pop_back_val();
+      if (!CVisited.insert(CII).second)
+        continue;
+
+      if (isa<PHINode>(CII) || isa<ShuffleVectorInst>(CII) ||
+          isa<InsertElementInst>(CII) || isa<ExtractElementInst>(CII))
+        for (User *V : CII->users())
+          if (auto *UseInst = dyn_cast<Instruction>(V))
+            UserList.push_back(UseInst);
+
+      if (CII->getParent() == II->getParent())
+        continue;
+
+      Profitable = shouldReplaceByOp(CII);
+    }
+    return Profitable;
+  }
+
+  LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST,
+                   const TargetTransformInfo &TTI)
+      : Mod(Mod), DL(Mod.getDataLayout()), ST(ST), TTI(TTI),
         ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
 };
@@ -140,7 +348,7 @@ bool AMDGPULateCodeGenPrepare::run() {
   // vectors to equivalent vectors of legal type (which are converted back
   // before uses in subsequent blocks), to pack the bits into fewer physical
   // registers (used in CopyToReg/CopyFromReg pairs).
-  LiveRegOptimizer LRO(*F.getParent(), ST);
+  LiveRegOptimizer LRO(*F.getParent(), ST, TTI);

   bool Changed = false;
@@ -259,6 +467,9 @@ bool LiveRegOptimizer::optimizeLiveType(
       if (!shouldReplace(II->getType()))
        continue;

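+      // Only coerce when the def or some cross-block user makes it
+      // worthwhile.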
+      if (!isCoercionProfitable(II))
+        continue;
+
       if (PHINode *Phi = dyn_cast<PHINode>(II)) {
         PhiNodes.insert(Phi);
         // Collect all the incoming values of problematic PHI nodes.
@@ -478,11 +689,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
 PreservedAnalyses
 AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);

   AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
   UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);

-  bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
+  bool Changed = AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();

   if (!Changed)
     return PreservedAnalyses::all();
@@ -518,13 +730,14 @@ bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);

   AssumptionCache &AC =
       getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UniformityInfo &UI =
       getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

-  return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
+  return AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
 }

 INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,