Skip to content

Commit 1417abe

Browse files
committed
[AMDGPU] Add new llvm.amdgcn.fma.legacy intrinsic
Differential Revision: https://reviews.llvm.org/D89558
1 parent 0c1381d commit 1417abe

File tree

8 files changed

+153
-4
lines changed

8 files changed

+153
-4
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,17 @@ def int_amdgcn_log_clamp : Intrinsic<
255255

256256
def int_amdgcn_fmul_legacy : GCCBuiltin<"__builtin_amdgcn_fmul_legacy">,
257257
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
258-
[IntrNoMem, IntrSpeculatable, IntrWillReturn]
258+
[IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
259+
>;
260+
261+
// Fused single-precision multiply-add with legacy behaviour for the multiply,
262+
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
263+
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
264+
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
265+
// has a completely different kind of legacy behaviour.)
//
// Operands are (a, b, c), all f32, and the f32 result is a * b + c with the
// legacy multiply described above; c is a plain addend and is not subject to
// the zero rule.
// NOTE(review): Commutative presumably covers swapping the two multiplicands
// (a and b) only, never the addend -- confirm against the property definition.
266+
def int_amdgcn_fma_legacy :
267+
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
268+
[IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
259269
>;
260270

261271
def int_amdgcn_rcp : Intrinsic<

llvm/lib/Analysis/ConstantFolding.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1504,6 +1504,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
15041504
case Intrinsic::amdgcn_cubesc:
15051505
case Intrinsic::amdgcn_cubetc:
15061506
case Intrinsic::amdgcn_fmul_legacy:
1507+
case Intrinsic::amdgcn_fma_legacy:
15071508
case Intrinsic::amdgcn_fract:
15081509
case Intrinsic::amdgcn_ldexp:
15091510
case Intrinsic::amdgcn_sin:
@@ -2371,8 +2372,8 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
23712372
if (IntrinsicID == Intrinsic::amdgcn_fmul_legacy) {
23722373
const APFloat &C1 = Op1->getValueAPF();
23732374
const APFloat &C2 = Op2->getValueAPF();
2374-
// The legacy behaviour is that multiplying zero by anything, even NaN
2375-
// or infinity, gives +0.0.
2375+
// The legacy behaviour is that multiplying +/- 0.0 by anything, even
2376+
// NaN or infinity, gives +0.0.
23762377
if (C1.isZero() || C2.isZero())
23772378
return ConstantFP::getNullValue(Ty);
23782379
return ConstantFP::get(Ty->getContext(), C1 * C2);
@@ -2706,6 +2707,19 @@ static Constant *ConstantFoldScalarCall3(StringRef Name,
27062707
if (const auto *Op3 = dyn_cast<ConstantFP>(Operands[2])) {
27072708
switch (IntrinsicID) {
27082709
default: break;
2710+
case Intrinsic::amdgcn_fma_legacy: {
2711+
const APFloat &C1 = Op1->getValueAPF();
2712+
const APFloat &C2 = Op2->getValueAPF();
2713+
// The legacy behaviour is that multiplying +/- 0.0 by anything, even
2714+
// NaN or infinity, gives +0.0.
2715+
if (C1.isZero() || C2.isZero()) {
2716+
const APFloat &C3 = Op3->getValueAPF();
2717+
// It's tempting to just return C3 here, but that would give the
2718+
// wrong result if C3 was -0.0: the +0.0 product plus -0.0 is +0.0.
2719+
return ConstantFP::get(Ty->getContext(), APFloat(0.0f) + C3);
2720+
}
2721+
LLVM_FALLTHROUGH;
2722+
}
27092723
case Intrinsic::fma:
27102724
case Intrinsic::fmuladd: {
27112725
APFloat V = Op1->getValueAPF();

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
599599
case AMDGPUISD::FMIN_LEGACY:
600600
case AMDGPUISD::FMAX_LEGACY:
601601
case AMDGPUISD::FMED3:
602+
// TODO: handle llvm.amdgcn.fma.legacy
602603
return true;
603604
default:
604605
return false;
@@ -3723,6 +3724,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
37233724
}
37243725
case ISD::FMA:
37253726
case ISD::FMAD: {
3727+
// TODO: handle llvm.amdgcn.fma.legacy
37263728
if (!mayIgnoreSignedZero(N0))
37273729
return SDValue();
37283730

@@ -4713,6 +4715,12 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
47134715
case Intrinsic::amdgcn_fdot2:
47144716
// TODO: Refine on operand
47154717
return SNaN;
4718+
case Intrinsic::amdgcn_fma_legacy:
4719+
if (SNaN)
4720+
return true;
4721+
return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4722+
DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4723+
DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
47164724
default:
47174725
return false;
47184726
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4020,6 +4020,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
40204020
case Intrinsic::amdgcn_rsq_legacy:
40214021
case Intrinsic::amdgcn_rsq_clamp:
40224022
case Intrinsic::amdgcn_fmul_legacy:
4023+
case Intrinsic::amdgcn_fma_legacy:
40234024
case Intrinsic::amdgcn_ldexp:
40244025
case Intrinsic::amdgcn_frexp_mant:
40254026
case Intrinsic::amdgcn_frexp_exp:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,17 @@ def : GCNPat <
895895
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
896896
>;
897897

898+
// Select llvm.amdgcn.fma.legacy to the fmac form (V_FMAC_LEGACY_F32_e64) when
// none of its three f32 operands carries a source modifier.
// Don't allow source modifiers. If there are any source modifiers then it's
899+
// better to select fma instead of fmac.
900+
let SubtargetPredicate = HasNoMadMacF32Insts in
901+
def : GCNPat <
902+
// Source pattern: every operand must match without modifiers.
(f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
903+
(VOP3NoMods f32:$src1),
904+
(VOP3NoMods f32:$src2))),
905+
// Result: the e64 encoding with all source modifiers, clamp and omod set to
// their NONE values.
(V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
906+
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
907+
>;
908+
898909
let SubtargetPredicate = Has16BitInsts in {
899910
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
900911
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,9 @@ def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
298298
} // End SubtargetPredicate = HasMadMacInsts
299299

300300
let SubtargetPredicate = HasNoMadMacF32Insts in
301-
def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
301+
// v_fma_legacy_f32, now associated with the llvm.amdgcn.fma.legacy intrinsic.
// The llc test added in this change expects this instruction whenever an
// operand carries fabs/fneg (the modifier-free case selects the fmac form).
// NOTE(review): presumably the third VOP3Inst argument is the node used to
// derive the selection pattern -- confirm against the VOP3Inst definition.
def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32",
302+
VOP3_Profile<VOP_F32_F32_F32_F32>,
303+
int_amdgcn_fma_legacy>;
302304
}
303305

304306
def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s
3+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s
4+
5+
; No source modifiers on any operand: the fmac form (v_fmac_legacy_f32_e64) is
; expected, accumulating into the register that holds %c.
define float @v_fma(float %a, float %b, float %c) {
6+
; GCN-LABEL: v_fma:
7+
; GCN: ; %bb.0:
8+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9+
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
10+
; GCN-NEXT: v_fmac_legacy_f32_e64 v2, v0, v1
11+
; GCN-NEXT: ; implicit-def: $vcc_hi
12+
; GCN-NEXT: v_mov_b32_e32 v0, v2
13+
; GCN-NEXT: s_setpc_b64 s[30:31]
14+
%fma = call float @llvm.amdgcn.fma.legacy(float %a, float %b, float %c)
15+
ret float %fma
16+
}
17+
18+
; fabs on the first multiplicand: the modifier is folded into the instruction
; and the three-operand v_fma_legacy_f32 form is expected instead of fmac.
define float @v_fabs_fma(float %a, float %b, float %c) {
19+
; GCN-LABEL: v_fabs_fma:
20+
; GCN: ; %bb.0:
21+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22+
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
23+
; GCN-NEXT: v_fma_legacy_f32 v0, |v0|, v1, v2
24+
; GCN-NEXT: ; implicit-def: $vcc_hi
25+
; GCN-NEXT: s_setpc_b64 s[30:31]
26+
%fabs.a = call float @llvm.fabs.f32(float %a)
27+
%fma = call float @llvm.amdgcn.fma.legacy(float %fabs.a, float %b, float %c)
28+
ret float %fma
29+
}
30+
31+
; fneg(fabs) on the second multiplicand: both modifiers are folded into the
; operand of v_fma_legacy_f32.
define float @v_fneg_fabs_fma(float %a, float %b, float %c) {
32+
; GCN-LABEL: v_fneg_fabs_fma:
33+
; GCN: ; %bb.0:
34+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35+
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
36+
; GCN-NEXT: v_fma_legacy_f32 v0, v0, -|v1|, v2
37+
; GCN-NEXT: ; implicit-def: $vcc_hi
38+
; GCN-NEXT: s_setpc_b64 s[30:31]
39+
%fabs.b = call float @llvm.fabs.f32(float %b)
40+
%neg.fabs.b = fneg float %fabs.b
41+
%fma = call float @llvm.amdgcn.fma.legacy(float %a, float %neg.fabs.b, float %c)
42+
ret float %fma
43+
}
44+
45+
; fneg on the addend: the negation is folded into the third operand of
; v_fma_legacy_f32.
define float @v_fneg_fma(float %a, float %b, float %c) {
46+
; GCN-LABEL: v_fneg_fma:
47+
; GCN: ; %bb.0:
48+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49+
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
50+
; GCN-NEXT: v_fma_legacy_f32 v0, v0, v1, -v2
51+
; GCN-NEXT: ; implicit-def: $vcc_hi
52+
; GCN-NEXT: s_setpc_b64 s[30:31]
53+
%neg.c = fneg float %c
54+
%fma = call float @llvm.amdgcn.fma.legacy(float %a, float %b, float %neg.c)
55+
ret float %fma
56+
}
57+
58+
declare float @llvm.amdgcn.fma.legacy(float, float, float)
59+
declare float @llvm.fabs.f32(float)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -instsimplify -S | FileCheck %s
3+
4+
declare float @llvm.amdgcn.fma.legacy(float, float, float)
5+
6+
; Constant-folding coverage for llvm.amdgcn.fma.legacy: every call has constant
; operands and its folded value is stored through %p with a volatile store so
; that it shows up in the expected output.
define void @test(float* %p) {
7+
; CHECK-LABEL: @test(
8+
; CHECK-NEXT: store volatile float 1.000000e+01, float* [[P:%.*]], align 4
9+
; CHECK-NEXT: store volatile float 4.000000e+00, float* [[P]], align 4
10+
; CHECK-NEXT: store volatile float 4.000000e+00, float* [[P]], align 4
11+
; CHECK-NEXT: store volatile float 0.000000e+00, float* [[P]], align 4
12+
; CHECK-NEXT: store volatile float 0.000000e+00, float* [[P]], align 4
13+
; CHECK-NEXT: store volatile float 0.000000e+00, float* [[P]], align 4
14+
; CHECK-NEXT: store volatile float 0.000000e+00, float* [[P]], align 4
15+
; CHECK-NEXT: store volatile float 4.000000e+00, float* [[P]], align 4
16+
; CHECK-NEXT: store volatile float 4.000000e+00, float* [[P]], align 4
17+
; CHECK-NEXT: store volatile float 4.000000e+00, float* [[P]], align 4
18+
; CHECK-NEXT: store volatile float 4.000000e+00, float* [[P]], align 4
19+
; CHECK-NEXT: ret void
20+
;
21+
; No zero multiplicand: ordinary fused multiply-add, 2 * 3 + 4 = 10.
%a = call float @llvm.amdgcn.fma.legacy(float +2.0, float +3.0, float +4.0)
22+
store volatile float %a, float* %p
23+
; One multiplicand is +0.0 or -0.0: the product is +0.0, so the result is the
; addend, 4.
%b = call float @llvm.amdgcn.fma.legacy(float +2.0, float +0.0, float +4.0)
24+
store volatile float %b, float* %p
25+
%c = call float @llvm.amdgcn.fma.legacy(float +2.0, float -0.0, float +4.0)
26+
store volatile float %c, float* %p
27+
; Zero multiplicands with a -0.0 addend: the result is +0.0, not -0.0, for
; every sign combination of the multiplicands.
%d = call float @llvm.amdgcn.fma.legacy(float +0.0, float +0.0, float -0.0)
28+
store volatile float %d, float* %p
29+
%e = call float @llvm.amdgcn.fma.legacy(float +0.0, float -0.0, float -0.0)
30+
store volatile float %e, float* %p
31+
%f = call float @llvm.amdgcn.fma.legacy(float -0.0, float +0.0, float -0.0)
32+
store volatile float %f, float* %p
33+
%g = call float @llvm.amdgcn.fma.legacy(float -0.0, float -0.0, float -0.0)
34+
store volatile float %g, float* %p
35+
; Zero times infinity or NaN (in either operand position) still gives a +0.0
; product, so the result is again the addend, 4.
%h = call float @llvm.amdgcn.fma.legacy(float +0.0, float 0x7ff0000000000000, float +4.0) ; +inf
36+
store volatile float %h, float* %p
37+
%i = call float @llvm.amdgcn.fma.legacy(float 0xfff0000000000000, float +0.0, float +4.0) ; -inf
38+
store volatile float %i, float* %p
39+
%j = call float @llvm.amdgcn.fma.legacy(float 0x7ff0001000000000, float -0.0, float +4.0) ; +nan
40+
store volatile float %j, float* %p
41+
%k = call float @llvm.amdgcn.fma.legacy(float -0.0, float 0xfff0000100000000, float +4.0) ; -nan
42+
store volatile float %k, float* %p
43+
ret void
44+
}

0 commit comments

Comments
 (0)