AMDGPU: Use real copysign in fast pow #97152
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes: Previously this would introduce some codegen regressions, but those have been avoided by simplifying demanded bits on copysign operations.

Full diff: https://github.com/llvm/llvm-project/pull/97152.diff — 4 files affected. A short illustrative sketch follows; the full diff comes after it.
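To make the transformation easier to follow outside the pass, here is a minimal standalone sketch of the new sign-handling sequence, assuming a bare module and an invented function `fold_pow_sign` whose `%exp2` argument stands in for the already-computed `exp2(y * log2(|x|))`; only the `CreateShl`/`CreateAnd`/`CreateBitCast`/`CreateCopySign` calls mirror the patched code in `AMDGPULibCalls::fold_pow`.

```cpp
// Standalone illustration only: the module, function name, and argument layout
// are invented; the emitted sequence mirrors the patched fold_pow sign path.
// Build (roughly): clang++ sketch.cpp $(llvm-config --cxxflags --ldflags --libs core)
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("pow_sign_sketch", Ctx);
  IRBuilder<> B(Ctx);

  // float fold_pow_sign(float %x, i32 %y, float %exp2)
  // %exp2 stands in for the already-computed exp2(y * log2(|x|)).
  FunctionType *FTy = FunctionType::get(
      B.getFloatTy(), {B.getFloatTy(), B.getInt32Ty(), B.getFloatTy()}, false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "fold_pow_sign", M);
  Value *X = F->getArg(0), *Y = F->getArg(1), *Exp2 = F->getArg(2);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  Type *IntTy = B.getInt32Ty();
  unsigned Size = IntTy->getScalarSizeInBits();

  // Compute the sign word exactly as before: (y << 31) & bitcast(x).
  Value *Sign = B.CreateShl(Y, Size - 1, "__yeven");
  Sign = B.CreateAnd(B.CreateBitCast(X, IntTy), Sign, "__pow_sign");

  // New in this patch: apply the sign with a real @llvm.copysign call instead
  // of bitcasting exp2, or-ing the sign bit in, and casting back to float.
  Value *Res = B.CreateCopySign(Exp2, B.CreateBitCast(Sign, B.getFloatTy()),
                                /*FMFSource=*/nullptr, "__pow_sign");
  B.CreateRet(Res);

  verifyFunction(*F, &errs());
  M.print(outs(), nullptr); // Prints the and/bitcast/llvm.copysign.f32 sequence.
  return 0;
}
```

Emitting the `llvm.copysign` intrinsic instead of the manual or/bitcast merge is what lets later demanded-bits simplification on copysign avoid the codegen regressions mentioned above.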
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 456f3cb332cf8..27fa67ce5b45a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1131,17 +1131,18 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
if (needcopysign) {
Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
Type *nTy = FPOp->getType()->getWithNewType(nTyS);
- unsigned size = nTy->getScalarSizeInBits();
Value *opr_n = FPOp->getOperand(1);
if (opr_n->getType()->getScalarType()->isIntegerTy())
opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
else
opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
+ unsigned size = nTy->getScalarSizeInBits();
Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
- nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
- nval = B.CreateBitCast(nval, opr0->getType());
+
+ nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
+ nullptr, "__pow_sign");
}
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
index 6b4b0f881f3be..ab52c8ff8d399 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -1783,7 +1783,8 @@ define float @test_pow_afn_f32_nnan_ninf__y_10(float %x) {
define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison
; CHECK-SAME: (<2 x float> [[X:%.*]]) {
-; CHECK-NEXT: ret <2 x float> poison
+; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> poison)
+; CHECK-NEXT: ret <2 x float> [[__EXP2]]
;
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> poison)
ret <2 x float> %pow
@@ -2215,10 +2216,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y.cast = sitofp i32 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2303,10 +2303,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y.cast = uitofp i32 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2352,10 +2351,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y.cast = uitofp i256 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2375,10 +2373,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256(float %x, i2
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y.cast = sitofp i256 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2398,10 +2395,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp(<2 x floa
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[TMP1]], <i32 31, i32 31>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
-; CHECK-NEXT: ret <2 x float> [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT: ret <2 x float> [[__POW_SIGN1]]
;
%y.cast = sitofp <2 x i32> %y to <2 x float>
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2447,10 +2443,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp(<2 x floa
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[TMP1]], <i32 31, i32 31>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
-; CHECK-NEXT: ret <2 x float> [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT: ret <2 x float> [[__POW_SIGN1]]
;
%y.cast = uitofp <2 x i32> %y to <2 x float>
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2559,10 +2554,9 @@ define float @test_pow_afn_f32_nnan_ninf__y_known_integral_trunc(float %x, float
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT: ret float [[TMP5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
%y = call float @llvm.trunc.f32(float %y.arg)
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
index 77db224af2890..8d5705d4deb4c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
@@ -679,10 +679,9 @@ define float @test_pown_afn_nnan_ninf_f32(float %x, i32 %y) {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT: ret float [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn float @_Z4pownfi(float %x, i32 %y)
@@ -701,10 +700,9 @@ define <2 x float> @test_pown_afn_nnan_ninf_v2f32(<2 x float> %x, <2 x i32> %y)
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[Y]], <i32 31, i32 31>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float>
-; CHECK-NEXT: ret <2 x float> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP1]])
+; CHECK-NEXT: ret <2 x float> [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn <2 x float> @_Z4pownDv2_fDv2_i(<2 x float> %x, <2 x i32> %y)
@@ -724,10 +722,9 @@ define double @test_pown_afn_nnan_ninf_f64(double %x, i32 %y) {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i64 [[__YTOU]], 63
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double [[X]] to i64
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i64 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[__EXP2]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
-; CHECK-NEXT: ret double [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[__POW_SIGN]] to double
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn double @llvm.copysign.f64(double [[__EXP2]], double [[TMP1]])
+; CHECK-NEXT: ret double [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn double @_Z4powndi(double %x, i32 %y)
@@ -747,10 +744,9 @@ define <2 x double> @test_pown_afn_nnan_ninf_v2f64(<2 x double> %x, <2 x i32> %y
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i64> [[__YTOU]], <i64 63, i64 63>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[X]] to <2 x i64>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i64> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[__EXP2]] to <2 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <2 x double>
-; CHECK-NEXT: ret <2 x double> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[__POW_SIGN]] to <2 x double>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x double> @llvm.copysign.v2f64(<2 x double> [[__EXP2]], <2 x double> [[TMP1]])
+; CHECK-NEXT: ret <2 x double> [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn <2 x double> @_Z4pownDv2_dDv2_i(<2 x double> %x, <2 x i32> %y)
@@ -770,10 +766,9 @@ define half @test_pown_afn_nnan_ninf_f16(half %x, i32 %y) {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i16 [[__YTOU]], 15
; CHECK-NEXT: [[TMP0:%.*]] = bitcast half [[X]] to i16
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i16 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[__EXP2]] to i16
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i16 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP2]] to half
-; CHECK-NEXT: ret half [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[__POW_SIGN]] to half
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn half @llvm.copysign.f16(half [[__EXP2]], half [[TMP1]])
+; CHECK-NEXT: ret half [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn half @_Z4pownDhi(half %x, i32 %y)
@@ -793,10 +788,9 @@ define <2 x half> @test_pown_afn_nnan_ninf_v2f16(<2 x half> %x, <2 x i32> %y) {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i16> [[__YTOU]], <i16 15, i16 15>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x half> [[X]] to <2 x i16>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i16> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x half> [[__EXP2]] to <2 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint <2 x i16> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x half>
-; CHECK-NEXT: ret <2 x half> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[__POW_SIGN]] to <2 x half>
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x half> @llvm.copysign.v2f16(<2 x half> [[__EXP2]], <2 x half> [[TMP1]])
+; CHECK-NEXT: ret <2 x half> [[__POW_SIGN1]]
;
entry:
%call = tail call nnan ninf afn <2 x half> @_Z4pownDv2_DhDv2_i(<2 x half> %x, <2 x i32> %y)
@@ -827,10 +821,9 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 {
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT: ret float [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call fast float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]]) #[[ATTR0]]
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
entry:
%call = tail call fast float @_Z4pownfi(float %x, i32 %y) #1
@@ -840,7 +833,8 @@ entry:
define float @test_pown_fast_f32__y_poison(float %x) {
; CHECK-LABEL: define float @test_pown_fast_f32__y_poison
; CHECK-SAME: (float [[X:%.*]]) {
-; CHECK-NEXT: ret float poison
+; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float poison)
+; CHECK-NEXT: ret float [[__EXP2]]
;
%call = tail call fast float @_Z4pownfi(float %x, i32 poison)
ret float %call
@@ -1073,10 +1067,9 @@ define float @test_pown_afn_ninf_nnan_f32__x_known_positive(float nofpclass(ninf
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT: ret float [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT: [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]])
+; CHECK-NEXT: ret float [[__POW_SIGN1]]
;
entry:
%call = tail call afn ninf nnan float @_Z4pownfi(float %x, i32 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 5a241f85b2e2c..7117f5ccc76e6 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -359,9 +359,9 @@ declare half @_Z4pownDhi(half, i32)
; GCN-NATIVE: %__yeven = shl i16 %__ytou, 15
; GCN-NATIVE: %0 = bitcast half %x to i16
; GCN-NATIVE: %__pow_sign = and i16 %__yeven, %0
-; GCN-NATIVE: %1 = bitcast half %__exp2 to i16
-; GCN-NATIVE: %2 = or disjoint i16 %__pow_sign, %1
-; GCN-NATIVE: %3 = bitcast i16 %2 to half
+; GCN-NATIVE: %1 = bitcast i16 %__pow_sign to half
+; GCN-NATIVE: %__pow_sign1 = tail call fast half @llvm.copysign.f16(half %__exp2, half %1)
+; GCN-NATIVE: ret half %__pow_sign1
define half @test_pown_f16(half %x, i32 %y) {
entry:
%call = call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -377,9 +377,9 @@ declare float @_Z4pownfi(float, i32)
; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
; GCN: %[[r0:.*]] = bitcast float %tmp to i32
; GCN: %__pow_sign = and i32 %[[r0]], -2147483648
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r1:.+]] = bitcast i32 %__pow_sign to float
+; GCN: %[[r2:.+]] = tail call fast float @llvm.copysign.f32(float %__exp2, float %[[r1]])
+; GCN: store float %[[r2]], ptr addrspace(1) %a, align 4
define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4
@@ -413,9 +413,9 @@ entry:
; GCN: %__yeven = shl i32 %conv, 31
; GCN: %[[r0:.*]] = bitcast float %tmp to i32
; GCN: %__pow_sign = and i32 %__yeven, %[[r0]]
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r1:.*]] = bitcast i32 %__pow_sign to float
+; GCN: %[[r2:.*]] = tail call fast float @llvm.copysign.f32(float %__exp2, float %[[r1]])
+; GCN: store float %[[r2]], ptr addrspace(1) %a, align 4
define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4
@@ -437,9 +437,9 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
; GCN: %1 = bitcast half %x to i16
; GCN: %__pow_sign = and i16 %1, -32768
-; GCN: %2 = bitcast half %__exp2 to i16
-; GCN: %3 = or disjoint i16 %__pow_sign, %2
-; GCN: %4 = bitcast i16 %3 to half
+; GCN: %2 = bitcast i16 %__pow_sign to half
+; GCN: %__pow_sign1 = tail call fast half @llvm.copysign.f16(half %__exp2, half %2)
+; GCN: ret half %__pow_sign1
define half @test_pow_fast_f16__y_13(half %x) {
%powr = tail call fast half @_Z3powDhDh(half %x, half 13.0)
ret half %powr
@@ -452,9 +452,9 @@ define half @test_pow_fast_f16__y_13(half %x) {
; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx)
; GCN: %1 = bitcast <2 x half> %x to <2 x i16>
; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
-; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
-; GCN: %3 = or disjoint <2 x i16> %__pow_sign, %2
-; GCN: %4 = bitcast <2 x i16> %3 to <2 x half>
+; GCN: %2 = bitcast <2 x i16> %__pow_sign to <2 x half>
+; GCN: %__pow_sign1 = tail call fast <2 x half> @llvm.copysign.v2f16(<2 x half> %__exp2, <2 x half> %2)
+; GCN: ret <2 x half> %__pow_sign1
define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) {
%powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>)
ret <2 x half> %powr
LGTM
LLVM Buildbot has detected new failures after this change. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/137/builds/1002
https://lab.llvm.org/buildbot/#/builders/185/builds/1004
https://lab.llvm.org/buildbot/#/builders/16/builds/1008
https://lab.llvm.org/buildbot/#/builders/56/builds/1383
https://lab.llvm.org/buildbot/#/builders/60/builds/1438
Previously this would introduce some codegen regressions, but those have been avoided by simplifying demanded bits on copysign operations.
This reverts commit d3e7c4c.