@@ -268,16 +268,12 @@ multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
268
268
}
269
269
}
270
270
271
- // Template for instructions which take three fp64 or fp32 args. The
272
- // instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
271
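+ // Template for floating-point minimum/maximum instructions. The f64 variants are only defined when NaN is false, since min.NaN.f64 and max.NaN.f64 do not exist.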
273
272
//
274
273
// Also defines ftz (flush subnormal inputs and results to sign-preserving
275
274
// zero) variants for the fp32, fp16 and bf16 instructions.
276
- //
277
- // This multiclass should be used for nodes that cannot be folded into FMAs.
278
- // For nodes that can be folded into FMAs (i.e. adds and muls), use
279
- // F3_fma_component.
280
- multiclass F3<string OpcStr, SDNode OpNode> {
275
+ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
276
+ if !not(NaN) then {
281
277
def f64rr :
282
278
NVPTXInst<(outs Float64Regs:$dst),
283
279
(ins Float64Regs:$a, Float64Regs:$b),
@@ -288,6 +284,7 @@ multiclass F3<string OpcStr, SDNode OpNode> {
288
284
(ins Float64Regs:$a, f64imm:$b),
289
285
!strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
290
286
[(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
287
+ }
291
288
def f32rr_ftz :
292
289
NVPTXInst<(outs Float32Regs:$dst),
293
290
(ins Float32Regs:$a, Float32Regs:$b),
@@ -322,45 +319,45 @@ multiclass F3<string OpcStr, SDNode OpNode> {
322
319
(ins Int16Regs:$a, Int16Regs:$b),
323
320
!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
324
321
[(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
325
- Requires<[useFP16Math]>;
322
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70> ]>;
326
323
327
324
def f16x2rr_ftz :
328
325
NVPTXInst<(outs Int32Regs:$dst),
329
326
(ins Int32Regs:$a, Int32Regs:$b),
330
327
!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
331
328
[(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
332
- Requires<[useFP16Math, doF32FTZ]>;
329
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
333
330
def f16x2rr :
334
331
NVPTXInst<(outs Int32Regs:$dst),
335
332
(ins Int32Regs:$a, Int32Regs:$b),
336
333
!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
337
334
[(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
338
- Requires<[useFP16Math]>;
335
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70> ]>;
339
336
def bf16rr_ftz :
340
337
NVPTXInst<(outs Int16Regs:$dst),
341
338
(ins Int16Regs:$a, Int16Regs:$b),
342
339
!strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
343
340
[(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
344
- Requires<[hasBF16Math, doF32FTZ]>;
341
+ Requires<[hasBF16Math, doF32FTZ, hasSM<80>, hasPTX<70> ]>;
345
342
def bf16rr :
346
343
NVPTXInst<(outs Int16Regs:$dst),
347
344
(ins Int16Regs:$a, Int16Regs:$b),
348
345
!strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
349
346
[(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
350
- Requires<[hasBF16Math]>;
347
+ Requires<[hasBF16Math, hasSM<80>, hasPTX<70> ]>;
351
348
352
349
def bf16x2rr_ftz :
353
350
NVPTXInst<(outs Int32Regs:$dst),
354
351
(ins Int32Regs:$a, Int32Regs:$b),
355
352
!strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
356
353
[(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
357
- Requires<[hasBF16Math, doF32FTZ]>;
354
+ Requires<[hasBF16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
358
355
def bf16x2rr :
359
356
NVPTXInst<(outs Int32Regs:$dst),
360
357
(ins Int32Regs:$a, Int32Regs:$b),
361
358
!strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
362
359
[(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
363
- Requires<[hasBF16Math]>;
360
+ Requires<[hasBF16Math, hasSM<80>, hasPTX<70> ]>;
364
361
}
365
362
366
363
// Template for instructions which take three FP args. The
@@ -1178,11 +1175,10 @@ defm FADD : F3_fma_component<"add", fadd>;
1178
1175
defm FSUB : F3_fma_component<"sub", fsub>;
1179
1176
defm FMUL : F3_fma_component<"mul", fmul>;
1180
1177
1181
- defm FMIN : F3<"min", fminnum>;
1182
- defm FMAX : F3<"max", fmaxnum>;
1183
- // Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
1184
- defm FMINNAN : F3<"min.NaN", fminimum>;
1185
- defm FMAXNAN : F3<"max.NaN", fmaximum>;
1178
+ defm FMIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
1179
+ defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
1180
+ defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
1181
+ defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;
1186
1182
1187
1183
defm FABS : F2<"abs", fabs>;
1188
1184
defm FNEG : F2<"neg", fneg>;
0 commit comments