Skip to content

Commit 27fe841

Browse files
committed
AMDGPU: Refine rcp/rsq intrinsic folding for modern FP rules
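We have to assume undef could be an sNaN, which would need quieting, so returning a qNaN is safer than returning undef. Also respect strictfp by skipping the constant fold for strictfp calls, and no longer require the folded result to be exact: fold even if the division was rounded.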
1 parent 1d96dca commit 27fe841

File tree

2 files changed

+38
-20
lines changed

2 files changed

+38
-20
lines changed

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3500,18 +3500,25 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
35003500
Value *Src = II->getArgOperand(0);
35013501

35023502
// TODO: Move to ConstantFolding/InstSimplify?
3503-
if (isa<UndefValue>(Src))
3504-
return replaceInstUsesWith(CI, Src);
3503+
if (isa<UndefValue>(Src)) {
3504+
Type *Ty = II->getType();
3505+
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
3506+
return replaceInstUsesWith(CI, QNaN);
3507+
}
3508+
3509+
if (II->isStrictFP())
3510+
break;
35053511

35063512
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
35073513
const APFloat &ArgVal = C->getValueAPF();
35083514
APFloat Val(ArgVal.getSemantics(), 1);
3509-
APFloat::opStatus Status = Val.divide(ArgVal,
3510-
APFloat::rmNearestTiesToEven);
3511-
// Only do this if it was exact and therefore not dependent on the
3512-
// rounding mode.
3513-
if (Status == APFloat::opOK)
3514-
return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
3515+
Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
3516+
3517+
// This is more precise than the instruction may give.
3518+
//
3519+
// TODO: The instruction always flushes denormal results (except for f16),
3520+
// should this also?
3521+
return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
35153522
}
35163523

35173524
break;
@@ -3520,8 +3527,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
35203527
Value *Src = II->getArgOperand(0);
35213528

35223529
// TODO: Move to ConstantFolding/InstSimplify?
3523-
if (isa<UndefValue>(Src))
3524-
return replaceInstUsesWith(CI, Src);
3530+
if (isa<UndefValue>(Src)) {
3531+
Type *Ty = II->getType();
3532+
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
3533+
return replaceInstUsesWith(CI, QNaN);
3534+
}
3535+
35253536
break;
35263537
}
35273538
case Intrinsic::amdgcn_frexp_mant:

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone
1010

1111
define float @test_constant_fold_rcp_f32_undef() nounwind {
1212
; CHECK-LABEL: @test_constant_fold_rcp_f32_undef(
13-
; CHECK-NEXT: ret float undef
13+
; CHECK-NEXT: ret float 0x7FF8000000000000
1414
;
1515
%val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone
1616
ret float %val
@@ -50,22 +50,29 @@ define double @test_constant_fold_rcp_f64_half() nounwind {
5050

5151
define float @test_constant_fold_rcp_f32_43() nounwind {
5252
; CHECK-LABEL: @test_constant_fold_rcp_f32_43(
53-
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01)
54-
; CHECK-NEXT: ret float [[VAL]]
53+
; CHECK-NEXT: ret float 0x3F97D05F40000000
5554
;
5655
%val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) nounwind readnone
5756
ret float %val
5857
}
5958

6059
define double @test_constant_fold_rcp_f64_43() nounwind {
6160
; CHECK-LABEL: @test_constant_fold_rcp_f64_43(
62-
; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01)
63-
; CHECK-NEXT: ret double [[VAL]]
61+
; CHECK-NEXT: ret double 0x3F97D05F417D05F4
6462
;
6563
%val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone
6664
ret double %val
6765
}
6866

67+
define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp {
68+
; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp(
69+
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #7
70+
; CHECK-NEXT: ret float [[VAL]]
71+
;
72+
%val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone
73+
ret float %val
74+
}
75+
6976
; --------------------------------------------------------------------
7077
; llvm.amdgcn.rsq
7178
; --------------------------------------------------------------------
@@ -74,7 +81,7 @@ declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone
7481

7582
define float @test_constant_fold_rsq_f32_undef() nounwind {
7683
; CHECK-LABEL: @test_constant_fold_rsq_f32_undef(
77-
; CHECK-NEXT: ret float undef
84+
; CHECK-NEXT: ret float 0x7FF8000000000000
7885
;
7986
%val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone
8087
ret float %val
@@ -2387,8 +2394,8 @@ declare i32 @llvm.amdgcn.ballot.i32(i1) nounwind readnone convergent
23872394

23882395
define i64 @ballot_nocombine_64(i1 %i) {
23892396
; CHECK-LABEL: @ballot_nocombine_64(
2390-
; CHECK-NEXT: %b = call i64 @llvm.amdgcn.ballot.i64(i1 %i)
2391-
; CHECK-NEXT: ret i64 %b
2397+
; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I:%.*]])
2398+
; CHECK-NEXT: ret i64 [[B]]
23922399
;
23932400
%b = call i64 @llvm.amdgcn.ballot.i64(i1 %i)
23942401
ret i64 %b
@@ -2413,8 +2420,8 @@ define i64 @ballot_one_64() {
24132420

24142421
define i32 @ballot_nocombine_32(i1 %i) {
24152422
; CHECK-LABEL: @ballot_nocombine_32(
2416-
; CHECK-NEXT: %b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)
2417-
; CHECK-NEXT: ret i32 %b
2423+
; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]])
2424+
; CHECK-NEXT: ret i32 [[B]]
24182425
;
24192426
%b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)
24202427
ret i32 %b

0 commit comments

Comments
 (0)