ConstantFolding: Constant fold some canonicalizes

arsenm · arsenm · commit 841a0edd030d · 2022-11-18T10:35:19.000-08:00
+/-0 is obviously foldable. Other non-special, non-subnormal
values are also probably OK. For denormal values, check
the calling function's denormal mode. For now, don't fold
denormals to the input for IEEE mode because as far as I know
the langref is still pretending LLVM's float isn't IEEE.

Also folds undef to 0, although NaN may make more sense. Skips
folding nans and infinities, although it should be OK to fold those
in a future change.
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
@@ -164,11 +164,25 @@ class Type {
   /// Return true if this is powerpc long double.
   bool isPPC_FP128Ty() const { return getTypeID() == PPC_FP128TyID; }
 
-  /// Return true if this is one of the six floating-point types
+  /// Return true if this is a well-behaved IEEE-like type, which has a IEEE
+  /// compatible layout as defined by isIEEE(), and does not have unnormal
+  /// values
+  bool isIEEELikeFPTy() const {
+    switch (getTypeID()) {
+    case DoubleTyID:
+    case FloatTyID:
+    case HalfTyID:
+    case BFloatTyID:
+    case FP128TyID:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  /// Return true if this is one of the floating-point types
   bool isFloatingPointTy() const {
-    return getTypeID() == HalfTyID || getTypeID() == BFloatTyID ||
-           getTypeID() == FloatTyID || getTypeID() == DoubleTyID ||
-           getTypeID() == X86_FP80TyID || getTypeID() == FP128TyID ||
+    return isIEEELikeFPTy() || getTypeID() == X86_FP80TyID ||
            getTypeID() == PPC_FP128TyID;
   }
 
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1626,6 +1626,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::trunc:
   case Intrinsic::nearbyint:
   case Intrinsic::rint:
+  case Intrinsic::canonicalize:
   // Constrained intrinsics can be folded if FP environment is known
   // to compiler.
   case Intrinsic::experimental_constrained_fma:
@@ -1941,6 +1942,39 @@ getEvaluationRoundingMode(const ConstrainedFPIntrinsic *CI) {
   return *ORM;
 }
 
+/// Try to constant fold llvm.canonicalize for the given caller and value.
+static Constant *constantFoldCanonicalize(const Type *Ty, const CallBase *CI,
+                                          const APFloat &Src) {
+  // Zero, positive and negative, is always OK to fold.
+  if (Src.isZero())
+    return ConstantFP::get(CI->getContext(), Src);
+
+  if (!Ty->isIEEELikeFPTy())
+    return nullptr;
+
+  // Zero is always canonical and the sign must be preserved.
+  //
+  // Denorms and nans may have special encodings, but it should be OK to fold a
+  // totally average number.
+  if (Src.isNormal() || Src.isInfinity())
+    return ConstantFP::get(CI->getContext(), Src);
+
+  if (Src.isDenormal()) {
+    DenormalMode DenormMode =
+        CI->getFunction()->getDenormalMode(Src.getSemantics());
+    if (DenormMode == DenormalMode::getIEEE())
+      return nullptr;
+
+    bool IsPositive = !Src.isNegative() ||
+                      DenormMode.Input == DenormalMode::PositiveZero ||
+                      DenormMode.Output == DenormalMode::PositiveZero;
+    return ConstantFP::get(CI->getContext(),
+                           APFloat::getZero(Src.getSemantics(), !IsPositive));
+  }
+
+  return nullptr;
+}
+
 static Constant *ConstantFoldScalarCall1(StringRef Name,
                                          Intrinsic::ID IntrinsicID,
                                          Type *Ty,
@@ -1957,14 +1991,22 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       return ConstantInt::getTrue(Ty->getContext());
     return nullptr;
   }
+
+  if (isa<PoisonValue>(Operands[0])) {
+    // TODO: All of these operations should probably propagate poison.
+    if (IntrinsicID == Intrinsic::canonicalize)
+      return PoisonValue::get(Ty);
+  }
+
   if (isa<UndefValue>(Operands[0])) {
     // cosine(arg) is between -1 and 1. cosine(invalid arg) is NaN.
     // ctpop() is between 0 and bitwidth, pick 0 for undef.
     // fptoui.sat and fptosi.sat can always fold to zero (for a zero input).
     if (IntrinsicID == Intrinsic::cos ||
         IntrinsicID == Intrinsic::ctpop ||
         IntrinsicID == Intrinsic::fptoui_sat ||
-        IntrinsicID == Intrinsic::fptosi_sat)
+        IntrinsicID == Intrinsic::fptosi_sat ||
+        IntrinsicID == Intrinsic::canonicalize)
       return Constant::getNullValue(Ty);
     if (IntrinsicID == Intrinsic::bswap ||
         IntrinsicID == Intrinsic::bitreverse ||
@@ -2032,6 +2074,9 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       return ConstantInt::get(Ty, Int);
     }
 
+    if (IntrinsicID == Intrinsic::canonicalize)
+      return constantFoldCanonicalize(Ty, Call, U);
+
     if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy())
       return nullptr;
 
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace
 ; VI-LABEL: test_fold_canonicalize_undef_value_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -26,17 +26,16 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short v0, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; CI-LABEL: test_fold_canonicalize_undef_value_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -1847,7 +1846,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(
 ; VI-LABEL: s_test_canonicalize_undef_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
+; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1858,17 +1857,16 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; CI-LABEL: s_test_canonicalize_undef_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -1934,19 +1932,19 @@ define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
 ; VI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; VI-NEXT:    v_bfrev_b32_e32 v0, 60
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v0, 60
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, 1.0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 1.0, i32 1
@@ -1958,20 +1956,20 @@ define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
 ; VI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; VI-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, 1.0
-; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 1.0, i32 0
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -1982,19 +1980,19 @@ define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
 ; VI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; VI-NEXT:    v_bfrev_b32_e32 v0, 50
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v0, 50
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x41800000
 ; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 16.0, i32 1
@@ -2006,20 +2004,20 @@ define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
 ; VI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4c00
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4c00
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41800000
-; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 16.0, i32 0
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -2086,7 +2084,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(
 ; VI-LABEL: s_test_canonicalize_undef_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2097,17 +2095,16 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(
 ; GFX9-LABEL: s_test_canonicalize_undef_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; CI-LABEL: s_test_canonicalize_undef_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    v_mov_b32_e32 v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float undef)
diff --git a/llvm/test/Transforms/InstSimplify/canonicalize.ll b/llvm/test/Transforms/InstSimplify/canonicalize.ll

Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)*`
`76`	`76`	`}`
`77`	`77`
`78`	`78`	`; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32:`
`79`		`-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}`
	`79`	`+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}`
`80`	`80`	`; GCN: {{flat\|global}}_store_dword v{{.+}}, [[REG]]`
`81`	`81`	`define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* %out) #1 {`
`82`	`82`	`%canonicalized = call float @llvm.canonicalize.f32(float undef)`