Skip to content

Commit 841a0ed

Browse files
committed
ConstantFolding: Constant fold some canonicalizes
+/-0 is obviously foldable. Other non-special, non-subnormal values are also probably OK. For denormal values, check the calling function's denormal mode. For now, don't fold denormals to the input for IEEE mode because, as far as I know, the LangRef is still pretending LLVM's float isn't IEEE. Also folds undef to 0, although NaN may make more sense. Skips folding NaNs and infinities, although it should be OK to fold those in a future change.
1 parent 184fbfd commit 841a0ed

File tree

5 files changed

+140
-137
lines changed

5 files changed

+140
-137
lines changed

llvm/include/llvm/IR/Type.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,25 @@ class Type {
164164
/// Return true if this is powerpc long double.
165165
bool isPPC_FP128Ty() const { return getTypeID() == PPC_FP128TyID; }
166166

167-
/// Return true if this is one of the six floating-point types
167+
/// Return true if this is a well-behaved IEEE-like type, which has an IEEE
168+
/// compatible layout as defined by isIEEE(), and does not have unnormal
169+
/// values
170+
bool isIEEELikeFPTy() const {
171+
switch (getTypeID()) {
172+
case DoubleTyID:
173+
case FloatTyID:
174+
case HalfTyID:
175+
case BFloatTyID:
176+
case FP128TyID:
177+
return true;
178+
default:
179+
return false;
180+
}
181+
}
182+
183+
/// Return true if this is one of the floating-point types
168184
bool isFloatingPointTy() const {
169-
return getTypeID() == HalfTyID || getTypeID() == BFloatTyID ||
170-
getTypeID() == FloatTyID || getTypeID() == DoubleTyID ||
171-
getTypeID() == X86_FP80TyID || getTypeID() == FP128TyID ||
185+
return isIEEELikeFPTy() || getTypeID() == X86_FP80TyID ||
172186
getTypeID() == PPC_FP128TyID;
173187
}
174188

llvm/lib/Analysis/ConstantFolding.cpp

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1626,6 +1626,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
16261626
case Intrinsic::trunc:
16271627
case Intrinsic::nearbyint:
16281628
case Intrinsic::rint:
1629+
case Intrinsic::canonicalize:
16291630
// Constrained intrinsics can be folded if FP environment is known
16301631
// to compiler.
16311632
case Intrinsic::experimental_constrained_fma:
@@ -1941,6 +1942,39 @@ getEvaluationRoundingMode(const ConstrainedFPIntrinsic *CI) {
19411942
return *ORM;
19421943
}
19431944

1945+
/// Try to constant fold llvm.canonicalize for the given caller and value.
1946+
static Constant *constantFoldCanonicalize(const Type *Ty, const CallBase *CI,
1947+
const APFloat &Src) {
1948+
// Zero, positive and negative, is always OK to fold.
1949+
if (Src.isZero())
1950+
return ConstantFP::get(CI->getContext(), Src);
1951+
1952+
if (!Ty->isIEEELikeFPTy())
1953+
return nullptr;
1954+
1955+
// Zero is always canonical and the sign must be preserved.
1956+
//
1957+
// Denorms and nans may have special encodings, but it should be OK to fold a
1958+
// totally average number.
1959+
if (Src.isNormal() || Src.isInfinity())
1960+
return ConstantFP::get(CI->getContext(), Src);
1961+
1962+
if (Src.isDenormal()) {
1963+
DenormalMode DenormMode =
1964+
CI->getFunction()->getDenormalMode(Src.getSemantics());
1965+
if (DenormMode == DenormalMode::getIEEE())
1966+
return nullptr;
1967+
1968+
bool IsPositive = !Src.isNegative() ||
1969+
DenormMode.Input == DenormalMode::PositiveZero ||
1970+
DenormMode.Output == DenormalMode::PositiveZero;
1971+
return ConstantFP::get(CI->getContext(),
1972+
APFloat::getZero(Src.getSemantics(), !IsPositive));
1973+
}
1974+
1975+
return nullptr;
1976+
}
1977+
19441978
static Constant *ConstantFoldScalarCall1(StringRef Name,
19451979
Intrinsic::ID IntrinsicID,
19461980
Type *Ty,
@@ -1957,14 +1991,22 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
19571991
return ConstantInt::getTrue(Ty->getContext());
19581992
return nullptr;
19591993
}
1994+
1995+
if (isa<PoisonValue>(Operands[0])) {
1996+
// TODO: All of these operations should probably propagate poison.
1997+
if (IntrinsicID == Intrinsic::canonicalize)
1998+
return PoisonValue::get(Ty);
1999+
}
2000+
19602001
if (isa<UndefValue>(Operands[0])) {
19612002
// cosine(arg) is between -1 and 1. cosine(invalid arg) is NaN.
19622003
// ctpop() is between 0 and bitwidth, pick 0 for undef.
19632004
// fptoui.sat and fptosi.sat can always fold to zero (for a zero input).
19642005
if (IntrinsicID == Intrinsic::cos ||
19652006
IntrinsicID == Intrinsic::ctpop ||
19662007
IntrinsicID == Intrinsic::fptoui_sat ||
1967-
IntrinsicID == Intrinsic::fptosi_sat)
2008+
IntrinsicID == Intrinsic::fptosi_sat ||
2009+
IntrinsicID == Intrinsic::canonicalize)
19682010
return Constant::getNullValue(Ty);
19692011
if (IntrinsicID == Intrinsic::bswap ||
19702012
IntrinsicID == Intrinsic::bitreverse ||
@@ -2032,6 +2074,9 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
20322074
return ConstantInt::get(Ty, Int);
20332075
}
20342076

2077+
if (IntrinsicID == Intrinsic::canonicalize)
2078+
return constantFoldCanonicalize(Ty, Call, U);
2079+
20352080
if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy())
20362081
return nullptr;
20372082

llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

Lines changed: 22 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace
1515
; VI-LABEL: test_fold_canonicalize_undef_value_f16:
1616
; VI: ; %bb.0:
1717
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
18-
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
18+
; VI-NEXT: v_mov_b32_e32 v2, 0
1919
; VI-NEXT: s_waitcnt lgkmcnt(0)
2020
; VI-NEXT: v_mov_b32_e32 v0, s0
2121
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -26,17 +26,16 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace
2626
; GFX9: ; %bb.0:
2727
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2828
; GFX9-NEXT: v_mov_b32_e32 v0, 0
29-
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
3029
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
31-
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
30+
; GFX9-NEXT: global_store_short v0, v0, s[0:1]
3231
; GFX9-NEXT: s_endpgm
3332
;
3433
; CI-LABEL: test_fold_canonicalize_undef_value_f16:
3534
; CI: ; %bb.0:
3635
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3736
; CI-NEXT: s_mov_b32 s3, 0xf000
3837
; CI-NEXT: s_mov_b32 s2, -1
39-
; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
38+
; CI-NEXT: v_mov_b32_e32 v0, 0
4039
; CI-NEXT: s_waitcnt lgkmcnt(0)
4140
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
4241
; CI-NEXT: s_endpgm
@@ -1847,7 +1846,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(
18471846
; VI-LABEL: s_test_canonicalize_undef_v2f16:
18481847
; VI: ; %bb.0:
18491848
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1850-
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
1849+
; VI-NEXT: v_mov_b32_e32 v2, 0
18511850
; VI-NEXT: s_waitcnt lgkmcnt(0)
18521851
; VI-NEXT: v_mov_b32_e32 v0, s0
18531852
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1858,17 +1857,16 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(
18581857
; GFX9: ; %bb.0:
18591858
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
18601859
; GFX9-NEXT: v_mov_b32_e32 v0, 0
1861-
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
18621860
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1863-
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1861+
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
18641862
; GFX9-NEXT: s_endpgm
18651863
;
18661864
; CI-LABEL: s_test_canonicalize_undef_v2f16:
18671865
; CI: ; %bb.0:
18681866
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
18691867
; CI-NEXT: s_mov_b32 s3, 0xf000
18701868
; CI-NEXT: s_mov_b32 s2, -1
1871-
; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
1869+
; CI-NEXT: v_mov_b32_e32 v0, 0
18721870
; CI-NEXT: s_waitcnt lgkmcnt(0)
18731871
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
18741872
; CI-NEXT: s_endpgm
@@ -1934,19 +1932,19 @@ define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
19341932
; VI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
19351933
; VI: ; %bb.0:
19361934
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1937-
; VI-NEXT: v_mov_b32_e32 v0, 0x3c003c00
1935+
; VI-NEXT: v_bfrev_b32_e32 v0, 60
19381936
; VI-NEXT: s_setpc_b64 s[30:31]
19391937
;
19401938
; GFX9-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
19411939
; GFX9: ; %bb.0:
19421940
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1943-
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c003c00
1941+
; GFX9-NEXT: v_bfrev_b32_e32 v0, 60
19441942
; GFX9-NEXT: s_setpc_b64 s[30:31]
19451943
;
19461944
; CI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
19471945
; CI: ; %bb.0:
19481946
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1949-
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
1947+
; CI-NEXT: v_mov_b32_e32 v0, 0
19501948
; CI-NEXT: v_mov_b32_e32 v1, 1.0
19511949
; CI-NEXT: s_setpc_b64 s[30:31]
19521950
%vec = insertelement <2 x half> undef, half 1.0, i32 1
@@ -1958,20 +1956,20 @@ define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
19581956
; VI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
19591957
; VI: ; %bb.0:
19601958
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1961-
; VI-NEXT: v_mov_b32_e32 v0, 0x3c003c00
1959+
; VI-NEXT: v_mov_b32_e32 v0, 0x3c00
19621960
; VI-NEXT: s_setpc_b64 s[30:31]
19631961
;
19641962
; GFX9-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
19651963
; GFX9: ; %bb.0:
19661964
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1967-
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c003c00
1965+
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
19681966
; GFX9-NEXT: s_setpc_b64 s[30:31]
19691967
;
19701968
; CI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
19711969
; CI: ; %bb.0:
19721970
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19731971
; CI-NEXT: v_mov_b32_e32 v0, 1.0
1974-
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
1972+
; CI-NEXT: v_mov_b32_e32 v1, 0
19751973
; CI-NEXT: s_setpc_b64 s[30:31]
19761974
%vec = insertelement <2 x half> undef, half 1.0, i32 0
19771975
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -1982,19 +1980,19 @@ define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
19821980
; VI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
19831981
; VI: ; %bb.0:
19841982
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1985-
; VI-NEXT: v_mov_b32_e32 v0, 0x4c004c00
1983+
; VI-NEXT: v_bfrev_b32_e32 v0, 50
19861984
; VI-NEXT: s_setpc_b64 s[30:31]
19871985
;
19881986
; GFX9-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
19891987
; GFX9: ; %bb.0:
19901988
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1991-
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4c004c00
1989+
; GFX9-NEXT: v_bfrev_b32_e32 v0, 50
19921990
; GFX9-NEXT: s_setpc_b64 s[30:31]
19931991
;
19941992
; CI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
19951993
; CI: ; %bb.0:
19961994
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1997-
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
1995+
; CI-NEXT: v_mov_b32_e32 v0, 0
19981996
; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
19991997
; CI-NEXT: s_setpc_b64 s[30:31]
20001998
%vec = insertelement <2 x half> undef, half 16.0, i32 1
@@ -2006,20 +2004,20 @@ define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
20062004
; VI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
20072005
; VI: ; %bb.0:
20082006
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2009-
; VI-NEXT: v_mov_b32_e32 v0, 0x4c004c00
2007+
; VI-NEXT: v_mov_b32_e32 v0, 0x4c00
20102008
; VI-NEXT: s_setpc_b64 s[30:31]
20112009
;
20122010
; GFX9-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
20132011
; GFX9: ; %bb.0:
20142012
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2015-
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4c004c00
2013+
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4c00
20162014
; GFX9-NEXT: s_setpc_b64 s[30:31]
20172015
;
20182016
; CI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
20192017
; CI: ; %bb.0:
20202018
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20212019
; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
2022-
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
2020+
; CI-NEXT: v_mov_b32_e32 v1, 0
20232021
; CI-NEXT: s_setpc_b64 s[30:31]
20242022
%vec = insertelement <2 x half> undef, half 16.0, i32 0
20252023
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -2086,7 +2084,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(
20862084
; VI-LABEL: s_test_canonicalize_undef_v4f16:
20872085
; VI: ; %bb.0:
20882086
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2089-
; VI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2087+
; VI-NEXT: v_mov_b32_e32 v0, 0
20902088
; VI-NEXT: v_mov_b32_e32 v1, v0
20912089
; VI-NEXT: s_waitcnt lgkmcnt(0)
20922090
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2097,17 +2095,16 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(
20972095
; GFX9-LABEL: s_test_canonicalize_undef_v4f16:
20982096
; GFX9: ; %bb.0:
20992097
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2100-
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2101-
; GFX9-NEXT: v_mov_b32_e32 v2, 0
2098+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
21022099
; GFX9-NEXT: v_mov_b32_e32 v1, v0
21032100
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2104-
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
2101+
; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
21052102
; GFX9-NEXT: s_endpgm
21062103
;
21072104
; CI-LABEL: s_test_canonicalize_undef_v4f16:
21082105
; CI: ; %bb.0:
21092106
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2110-
; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00
2107+
; CI-NEXT: v_mov_b32_e32 v0, 0
21112108
; CI-NEXT: s_mov_b32 s3, 0xf000
21122109
; CI-NEXT: s_mov_b32 s2, -1
21132110
; CI-NEXT: v_mov_b32_e32 v1, v0

llvm/test/CodeGen/AMDGPU/fcanonicalize.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)*
7676
}
7777

7878
; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32:
79-
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
79+
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
8080
; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
8181
define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* %out) #1 {
8282
%canonicalized = call float @llvm.canonicalize.f32(float undef)

0 commit comments

Comments
 (0)