@@ -154,10 +154,12 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
154
154
multiclass MadFmaMixPats<SDPatternOperator fma_like,
155
155
Instruction mix_inst,
156
156
Instruction mixlo_inst,
157
- Instruction mixhi_inst> {
157
+ Instruction mixhi_inst,
158
+ bit HasFP32Denormals> {
158
159
// At least one of the operands needs to be an fpextend of an f16
159
160
// for this to be worthwhile, so we need three patterns here.
160
161
// TODO: Could we use a predicate to inspect src1/2/3 instead?
162
+ let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]) in {
161
163
def : GCNPat <
162
164
(f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
163
165
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
@@ -177,6 +179,45 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
177
179
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
178
180
DSTCLAMP.NONE)>;
179
181
182
+ def : GCNPat <
183
+ (AMDGPUclamp (build_vector
184
+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
185
+ (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
186
+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
187
+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
188
+ (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
189
+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
190
+ (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
191
+ $hi_src1_modifiers, $hi_src1,
192
+ $hi_src2_modifiers, $hi_src2,
193
+ DSTCLAMP.ENABLE,
194
+ (mixlo_inst $lo_src0_modifiers, $lo_src0,
195
+ $lo_src1_modifiers, $lo_src1,
196
+ $lo_src2_modifiers, $lo_src2,
197
+ DSTCLAMP.ENABLE,
198
+ (i32 (IMPLICIT_DEF)))))
199
+ >;
200
+
201
+ def : GCNPat <
202
+ (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
203
+ (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
204
+ (mixlo_inst $src0_modifiers, $src0,
205
+ $src1_modifiers, $src1,
206
+ (i32 0), (i32 0),
207
+ DSTCLAMP.NONE,
208
+ (i32 (IMPLICIT_DEF)))
209
+ >;
210
+
211
+ def : GCNPat <
212
+ (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
213
+ (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
214
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
215
+ $src1_modifiers, $src1,
216
+ (i32 0), (i32 0),
217
+ DSTCLAMP.NONE,
218
+ VGPR_32:$elt0))
219
+ >;
220
+
180
221
def : GCNPat <
181
222
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
182
223
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
@@ -187,10 +228,14 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
187
228
DSTCLAMP.NONE,
188
229
(i32 (IMPLICIT_DEF)))
189
230
>;
231
+ } // End OtherPredicates
190
232
191
233
// FIXME: Special case handling for mixhi (especially for clamp)
192
234
// because dealing with the write to high half of the register is
193
235
// difficult.
236
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
237
+ let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]), True16Predicate = p in {
238
+
194
239
def : GCNPat <
195
240
(build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
196
241
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
@@ -215,44 +260,44 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
215
260
VGPR_32:$elt0))
216
261
>;
217
262
218
- def : GCNPat <
219
- (AMDGPUclamp (build_vector
220
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
221
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
222
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
223
- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
224
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
225
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
226
- (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
227
- $hi_src1_modifiers, $hi_src1,
228
- $hi_src2_modifiers, $hi_src2,
229
- DSTCLAMP.ENABLE,
230
- (mixlo_inst $lo_src0_modifiers, $lo_src0,
231
- $lo_src1_modifiers, $lo_src1,
232
- $lo_src2_modifiers, $lo_src2,
233
- DSTCLAMP.ENABLE,
234
- (i32 (IMPLICIT_DEF)))))
235
- >;
263
+ } // End OtherPredicates, True16Predicate
236
264
265
+ let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]), True16Predicate = UseRealTrue16Insts in {
237
266
def : GCNPat <
238
- (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
239
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
240
- (mixlo_inst $src0_modifiers, $src0,
267
+ (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
268
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
269
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
270
+ (v2f16 (mixlo_inst $src0_modifiers, $src0,
241
271
$src1_modifiers, $src1,
242
- (i32 0), (i32 0) ,
272
+ $src2_modifiers, $src2 ,
243
273
DSTCLAMP.NONE,
244
- (i32 ( IMPLICIT_DEF)))
274
+ (REG_SEQUENCE VGPR_32, (f16 ( IMPLICIT_DEF)), lo16, $elt1, hi16 )))
245
275
>;
246
276
247
277
def : GCNPat <
248
- (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
249
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
278
+ (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
279
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
280
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
250
281
(v2f16 (mixhi_inst $src0_modifiers, $src0,
251
282
$src1_modifiers, $src1,
252
- (i32 0), (i32 0) ,
283
+ $src2_modifiers, $src2 ,
253
284
DSTCLAMP.NONE,
254
- VGPR_32:$elt0))
285
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
286
+ >;
287
+
288
+ def : GCNPat <
289
+ (build_vector
290
+ f16:$elt0,
291
+ (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
292
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
293
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
294
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
295
+ $src1_modifiers, $src1,
296
+ $src2_modifiers, $src2,
297
+ DSTCLAMP.ENABLE,
298
+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
255
299
>;
300
+ } // End OtherPredicates, True16Predicate
256
301
}
257
302
258
303
class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
@@ -266,7 +311,8 @@ def : MinimumMaximumByMinimum3Maximum3VOP3P<fminimum, V_PK_MINIMUM3_F16>;
266
311
def : MinimumMaximumByMinimum3Maximum3VOP3P<fmaximum, V_PK_MAXIMUM3_F16>;
267
312
}
268
313
269
- let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in {
314
+ let SubtargetPredicate = HasMadMixInsts in {
315
+ let OtherPredicates = [NoFP32Denormals] in {
270
316
271
317
// These are VOP3a-like opcodes which accept no omod.
272
318
// Size of src arguments (16/32) is controlled by op_sel.
@@ -284,9 +330,10 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
284
330
}
285
331
} // End FPDPRounding = 1
286
332
}
333
+ } // End OtherPredicates = [NoFP32Denormals]
287
334
288
- defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
289
- } // End SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals]
335
+ defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16, 0 /*HasFP32Denormals*/ >;
336
+ } // End SubtargetPredicate = HasMadMixInsts
290
337
291
338
292
339
// Essentially the same as the mad_mix versions
@@ -306,7 +353,7 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
306
353
} // End FPDPRounding = 1
307
354
}
308
355
309
- defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
356
+ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16, 1 /*HasFP32Denormals*/ >;
310
357
}
311
358
312
359
// Defines patterns that extract signed 4bit from each Idx[0].
0 commit comments