Skip to content

AMDGPU: Use real copysign in fast pow #97152

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 1, 2024

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Jun 29, 2024

Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.

Copy link
Contributor Author

arsenm commented Jun 29, 2024

This stack of pull requests is managed by Graphite. Learn more about stacking.

Join @arsenm and the rest of your teammates on Graphite Graphite

@llvmbot
Copy link
Member

llvmbot commented Jun 29, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.


Full diff: https://github.com/llvm/llvm-project/pull/97152.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp (+4-3)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll (+23-29)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll (+26-33)
  • (modified) llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll (+15-15)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 456f3cb332cf8..27fa67ce5b45a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1131,17 +1131,18 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
   if (needcopysign) {
     Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
     Type *nTy = FPOp->getType()->getWithNewType(nTyS);
-    unsigned size = nTy->getScalarSizeInBits();
     Value *opr_n = FPOp->getOperand(1);
     if (opr_n->getType()->getScalarType()->isIntegerTy())
       opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
     else
       opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
 
+    unsigned size = nTy->getScalarSizeInBits();
     Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
     sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
-    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
-    nval = B.CreateBitCast(nval, opr0->getType());
+
+    nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
+                            nullptr, "__pow_sign");
   }
 
   LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
index 6b4b0f881f3be..ab52c8ff8d399 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -1783,7 +1783,8 @@ define float @test_pow_afn_f32_nnan_ninf__y_10(float %x) {
 define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison(<2 x float> %x) {
 ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison
 ; CHECK-SAME: (<2 x float> [[X:%.*]]) {
-; CHECK-NEXT:    ret <2 x float> poison
+; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> poison)
+; CHECK-NEXT:    ret <2 x float> [[__EXP2]]
 ;
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> poison)
   ret <2 x float> %pow
@@ -2215,10 +2216,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y.cast = sitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2303,10 +2303,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y.cast = uitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2352,10 +2351,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y.cast = uitofp i256 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2375,10 +2373,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256(float %x, i2
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y.cast = sitofp i256 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2398,10 +2395,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp(<2 x floa
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[TMP1]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
-; CHECK-NEXT:    ret <2 x float> [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT:    ret <2 x float> [[__POW_SIGN1]]
 ;
   %y.cast = sitofp <2 x i32> %y to <2 x float>
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2447,10 +2443,9 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp(<2 x floa
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[TMP1]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
-; CHECK-NEXT:    ret <2 x float> [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP3]])
+; CHECK-NEXT:    ret <2 x float> [[__POW_SIGN1]]
 ;
   %y.cast = uitofp <2 x i32> %y to <2 x float>
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2559,10 +2554,9 @@ define float @test_pow_afn_f32_nnan_ninf__y_known_integral_trunc(float %x, float
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:    ret float [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
   %y = call float @llvm.trunc.f32(float %y.arg)
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
index 77db224af2890..8d5705d4deb4c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
@@ -679,10 +679,9 @@ define float @test_pown_afn_nnan_ninf_f32(float %x, i32 %y) {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT:    ret float [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn float @_Z4pownfi(float %x, i32 %y)
@@ -701,10 +700,9 @@ define <2 x float> @test_pown_afn_nnan_ninf_v2f32(<2 x float> %x, <2 x i32> %y)
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i32> [[Y]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint <2 x i32> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    ret <2 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[__POW_SIGN]] to <2 x float>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x float> @llvm.copysign.v2f32(<2 x float> [[__EXP2]], <2 x float> [[TMP1]])
+; CHECK-NEXT:    ret <2 x float> [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn <2 x float> @_Z4pownDv2_fDv2_i(<2 x float> %x, <2 x i32> %y)
@@ -724,10 +722,9 @@ define double @test_pown_afn_nnan_ninf_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i64 [[__YTOU]], 63
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double [[X]] to i64
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i64 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[__EXP2]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
-; CHECK-NEXT:    ret double [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[__POW_SIGN]] to double
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn double @llvm.copysign.f64(double [[__EXP2]], double [[TMP1]])
+; CHECK-NEXT:    ret double [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn double @_Z4powndi(double %x, i32 %y)
@@ -747,10 +744,9 @@ define <2 x double> @test_pown_afn_nnan_ninf_v2f64(<2 x double> %x, <2 x i32> %y
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i64> [[__YTOU]], <i64 63, i64 63>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[X]] to <2 x i64>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i64> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[__EXP2]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[__POW_SIGN]] to <2 x double>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x double> @llvm.copysign.v2f64(<2 x double> [[__EXP2]], <2 x double> [[TMP1]])
+; CHECK-NEXT:    ret <2 x double> [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn <2 x double> @_Z4pownDv2_dDv2_i(<2 x double> %x, <2 x i32> %y)
@@ -770,10 +766,9 @@ define half @test_pown_afn_nnan_ninf_f16(half %x, i32 %y) {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i16 [[__YTOU]], 15
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast half [[X]] to i16
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i16 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[__EXP2]] to i16
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i16 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[TMP2]] to half
-; CHECK-NEXT:    ret half [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[__POW_SIGN]] to half
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn half @llvm.copysign.f16(half [[__EXP2]], half [[TMP1]])
+; CHECK-NEXT:    ret half [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn half @_Z4pownDhi(half %x, i32 %y)
@@ -793,10 +788,9 @@ define <2 x half> @test_pown_afn_nnan_ninf_v2f16(<2 x half> %x, <2 x i32> %y) {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl <2 x i16> [[__YTOU]], <i16 15, i16 15>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x half> [[X]] to <2 x i16>
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and <2 x i16> [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x half> [[__EXP2]] to <2 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint <2 x i16> [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x half>
-; CHECK-NEXT:    ret <2 x half> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[__POW_SIGN]] to <2 x half>
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn <2 x half> @llvm.copysign.v2f16(<2 x half> [[__EXP2]], <2 x half> [[TMP1]])
+; CHECK-NEXT:    ret <2 x half> [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call nnan ninf afn <2 x half> @_Z4pownDv2_DhDv2_i(<2 x half> %x, <2 x i32> %y)
@@ -827,10 +821,9 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 {
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT:    ret float [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call fast float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]]) #[[ATTR0]]
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y) #1
@@ -840,7 +833,8 @@ entry:
 define float @test_pown_fast_f32__y_poison(float %x) {
 ; CHECK-LABEL: define float @test_pown_fast_f32__y_poison
 ; CHECK-SAME: (float [[X:%.*]]) {
-; CHECK-NEXT:    ret float poison
+; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float poison)
+; CHECK-NEXT:    ret float [[__EXP2]]
 ;
   %call = tail call fast float @_Z4pownfi(float %x, i32 poison)
   ret float %call
@@ -1073,10 +1067,9 @@ define float @test_pown_afn_ninf_nnan_f32__x_known_positive(float nofpclass(ninf
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT:    ret float [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:    [[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP1]])
+; CHECK-NEXT:    ret float [[__POW_SIGN1]]
 ;
 entry:
   %call = tail call afn ninf nnan float @_Z4pownfi(float %x, i32 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 5a241f85b2e2c..7117f5ccc76e6 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -359,9 +359,9 @@ declare half @_Z4pownDhi(half, i32)
 ; GCN-NATIVE: %__yeven = shl i16 %__ytou, 15
 ; GCN-NATIVE: %0 = bitcast half %x to i16
 ; GCN-NATIVE: %__pow_sign = and i16 %__yeven, %0
-; GCN-NATIVE: %1 = bitcast half %__exp2 to i16
-; GCN-NATIVE: %2 = or disjoint i16 %__pow_sign, %1
-; GCN-NATIVE: %3 = bitcast i16 %2 to half
+; GCN-NATIVE: %1 = bitcast i16 %__pow_sign to half
+; GCN-NATIVE: %__pow_sign1 = tail call fast half @llvm.copysign.f16(half %__exp2, half %1)
+; GCN-NATIVE: ret half %__pow_sign1
 define half @test_pown_f16(half %x, i32 %y) {
 entry:
   %call = call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -377,9 +377,9 @@ declare float @_Z4pownfi(float, i32)
 ; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
 ; GCN: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN: %__pow_sign = and i32 %[[r0]], -2147483648
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r1:.+]] = bitcast i32 %__pow_sign to float
+; GCN: %[[r2:.+]] = tail call fast float @llvm.copysign.f32(float %__exp2, float %[[r1]])
+; GCN: store float %[[r2]], ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -413,9 +413,9 @@ entry:
 ; GCN: %__yeven = shl i32 %conv, 31
 ; GCN: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN: %__pow_sign = and i32 %__yeven, %[[r0]]
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r1:.*]] = bitcast i32 %__pow_sign to float
+; GCN: %[[r2:.*]] = tail call fast float @llvm.copysign.f32(float %__exp2, float %[[r1]])
+; GCN: store float %[[r2]], ptr addrspace(1) %a, align 4
 define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -437,9 +437,9 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
 ; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
 ; GCN: %1 = bitcast half %x to i16
 ; GCN: %__pow_sign = and i16 %1, -32768
-; GCN: %2 = bitcast half %__exp2 to i16
-; GCN: %3 = or disjoint i16 %__pow_sign, %2
-; GCN: %4 = bitcast i16 %3 to half
+; GCN: %2 = bitcast i16 %__pow_sign to half
+; GCN: %__pow_sign1 = tail call fast half @llvm.copysign.f16(half %__exp2, half %2)
+; GCN: ret half %__pow_sign1
 define half @test_pow_fast_f16__y_13(half %x) {
   %powr = tail call fast half @_Z3powDhDh(half %x, half 13.0)
   ret half %powr
@@ -452,9 +452,9 @@ define half @test_pow_fast_f16__y_13(half %x) {
 ; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx)
 ; GCN: %1 = bitcast <2 x half> %x to <2 x i16>
 ; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
-; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
-; GCN: %3 = or disjoint <2 x i16> %__pow_sign, %2
-; GCN: %4 = bitcast <2 x i16> %3 to <2 x half>
+; GCN: %2 = bitcast <2 x i16> %__pow_sign to <2 x half>
+; GCN: %__pow_sign1 = tail call fast <2 x half> @llvm.copysign.v2f16(<2 x half> %__exp2, <2 x half> %2)
+; GCN: ret <2 x half> %__pow_sign1
 define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) {
   %powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>)
   ret <2 x half> %powr

@arsenm arsenm marked this pull request as ready for review June 29, 2024 07:50
@arsenm arsenm force-pushed the users/arsenm/dag-simplifydemanded-bits-copysign-sign-operand branch from 1cc5dca to 627e1f1 Compare July 1, 2024 10:15
Base automatically changed from users/arsenm/dag-simplifydemanded-bits-copysign-sign-operand to main July 1, 2024 10:19
Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.
@arsenm arsenm force-pushed the users/arsenm/amdgpu-libcall-fast-pow-use-copysign branch from 339a760 to 3810635 Compare July 1, 2024 18:07
@arsenm arsenm merged commit d3e7c4c into main Jul 1, 2024
4 of 6 checks passed
@arsenm arsenm deleted the users/arsenm/amdgpu-libcall-fast-pow-use-copysign branch July 1, 2024 18:16
@llvm-ci
Copy link
Collaborator

llvm-ci commented Jul 1, 2024

LLVM Buildbot has detected a new failure on builder ml-opt-dev-x86-64 running on ml-opt-dev-x86-64-b2 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/1002

Here is the relevant piece of the build log for the reference:

Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /b/ml-opt-dev-x86-64-b1/build/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < /b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll | /b/ml-opt-dev-x86-64-b1/build/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | /b/ml-opt-dev-x86-64-b1/build/bin/FileCheck /b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/ml-opt-dev-x86-64-b1/build/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink
+ /b/ml-opt-dev-x86-64-b1/build/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink
+ /b/ml-opt-dev-x86-64-b1/build/bin/FileCheck /b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
/b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:62:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
              ^
<stdin>:76:2: note: 'next' match was here
 v_cvt_f16_f32_e32 v1, v1
 ^
<stdin>:74:24: note: previous match ended here
 v_log_f16_e64 v3, |v0|
                       ^
<stdin>:75:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
^
/b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:105:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
              ^
<stdin>:126:26: note: scanning from here
 v_mul_f32_e32 v2, v2, v3
                         ^
<stdin>:127:2: note: possible intended match here
 v_and_b32_e32 v0, v1, v0
 ^
/b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:121:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
              ^
<stdin>:152:26: note: scanning from here
 s_mov_b64 exec, s[18:19]
                         ^
<stdin>:153:2: note: possible intended match here
 v_writelane_b32 v43, s16, 15
 ^
/b/ml-opt-dev-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:357:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
              ^
<stdin>:420:2: note: 'next' match was here
 v_mul_f16_e32 v2, v3, v2
 ^
<stdin>:418:26: note: previous match ended here
 v_cvt_f16_f32_e32 v2, v2
                         ^
<stdin>:419:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
...

@llvm-ci
Copy link
Collaborator

llvm-ci commented Jul 1, 2024

LLVM Buildbot has detected a new failure on builder ml-opt-rel-x86-64 running on ml-opt-rel-x86-64-b2 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/1004

Here is the relevant piece of the build log for the reference:

Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /b/ml-opt-rel-x86-64-b1/build/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll | /b/ml-opt-rel-x86-64-b1/build/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | /b/ml-opt-rel-x86-64-b1/build/bin/FileCheck /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/ml-opt-rel-x86-64-b1/build/bin/FileCheck /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/ml-opt-rel-x86-64-b1/build/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink
+ /b/ml-opt-rel-x86-64-b1/build/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink
/b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:62:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
              ^
<stdin>:76:2: note: 'next' match was here
 v_cvt_f16_f32_e32 v1, v1
 ^
<stdin>:74:24: note: previous match ended here
 v_log_f16_e64 v3, |v0|
                       ^
<stdin>:75:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
^
/b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:105:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
              ^
<stdin>:126:26: note: scanning from here
 v_mul_f32_e32 v2, v2, v3
                         ^
<stdin>:127:2: note: possible intended match here
 v_and_b32_e32 v0, v1, v0
 ^
/b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:121:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
              ^
<stdin>:152:26: note: scanning from here
 s_mov_b64 exec, s[18:19]
                         ^
<stdin>:153:2: note: possible intended match here
 v_writelane_b32 v43, s16, 15
 ^
/b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:357:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
              ^
<stdin>:420:2: note: 'next' match was here
 v_mul_f16_e32 v2, v3, v2
 ^
<stdin>:418:26: note: previous match ended here
 v_cvt_f16_f32_e32 v2, v2
                         ^
<stdin>:419:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
...

arsenm added a commit that referenced this pull request Jul 1, 2024
@llvm-ci
Copy link
Collaborator

llvm-ci commented Jul 1, 2024

LLVM Buildbot has detected a new failure on builder llvm-clang-x86_64-expensive-checks-debian running on gribozavr4 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/1008

Here is the relevant piece of the build log for the reference:

Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /b/1/llvm-clang-x86_64-expensive-checks-debian/build/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < /b/1/llvm-clang-x86_64-expensive-checks-debian/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll | /b/1/llvm-clang-x86_64-expensive-checks-debian/build/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | /b/1/llvm-clang-x86_64-expensive-checks-debian/build/bin/FileCheck /b/1/llvm-clang-x86_64-expensive-checks-debian/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/1/llvm-clang-x86_64-expensive-checks-debian/build/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink
+ /b/1/llvm-clang-x86_64-expensive-checks-debian/build/bin/FileCheck /b/1/llvm-clang-x86_64-expensive-checks-debian/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/1/llvm-clang-x86_64-expensive-checks-debian/build/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink
/b/1/llvm-clang-x86_64-expensive-checks-debian/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:62:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
              ^
<stdin>:76:2: note: 'next' match was here
 v_cvt_f16_f32_e32 v1, v1
 ^
<stdin>:74:24: note: previous match ended here
 v_log_f16_e64 v3, |v0|
                       ^
<stdin>:75:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
^
/b/1/llvm-clang-x86_64-expensive-checks-debian/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:105:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
              ^
<stdin>:126:26: note: scanning from here
 v_mul_f32_e32 v2, v2, v3
                         ^
<stdin>:127:2: note: possible intended match here
 v_and_b32_e32 v0, v1, v0
 ^
/b/1/llvm-clang-x86_64-expensive-checks-debian/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:121:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
              ^
<stdin>:152:26: note: scanning from here
 s_mov_b64 exec, s[18:19]
                         ^
<stdin>:153:2: note: possible intended match here
 v_writelane_b32 v43, s16, 15
 ^
/b/1/llvm-clang-x86_64-expensive-checks-debian/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:357:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
              ^
<stdin>:420:2: note: 'next' match was here
 v_mul_f16_e32 v2, v3, v2
 ^
<stdin>:418:26: note: previous match ended here
 v_cvt_f16_f32_e32 v2, v2
                         ^
<stdin>:419:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
...

@llvm-ci
Copy link
Collaborator

llvm-ci commented Jul 1, 2024

LLVM Buildbot has detected a new failure on builder clang-x86_64-debian-fast running on gribozavr4 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/56/builds/1383

Here is the relevant piece of the build log for the reference:

Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /b/1/clang-x86_64-debian-fast/llvm.obj/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < /b/1/clang-x86_64-debian-fast/llvm.src/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll | /b/1/clang-x86_64-debian-fast/llvm.obj/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | /b/1/clang-x86_64-debian-fast/llvm.obj/bin/FileCheck /b/1/clang-x86_64-debian-fast/llvm.src/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/1/clang-x86_64-debian-fast/llvm.obj/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink
+ /b/1/clang-x86_64-debian-fast/llvm.obj/bin/FileCheck /b/1/clang-x86_64-debian-fast/llvm.src/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/1/clang-x86_64-debian-fast/llvm.obj/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink
/b/1/clang-x86_64-debian-fast/llvm.src/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:62:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
              ^
<stdin>:76:2: note: 'next' match was here
 v_cvt_f16_f32_e32 v1, v1
 ^
<stdin>:74:24: note: previous match ended here
 v_log_f16_e64 v3, |v0|
                       ^
<stdin>:75:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
^
/b/1/clang-x86_64-debian-fast/llvm.src/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:105:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
              ^
<stdin>:126:26: note: scanning from here
 v_mul_f32_e32 v2, v2, v3
                         ^
<stdin>:127:2: note: possible intended match here
 v_and_b32_e32 v0, v1, v0
 ^
/b/1/clang-x86_64-debian-fast/llvm.src/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:121:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
              ^
<stdin>:152:26: note: scanning from here
 s_mov_b64 exec, s[18:19]
                         ^
<stdin>:153:2: note: possible intended match here
 v_writelane_b32 v43, s16, 15
 ^
/b/1/clang-x86_64-debian-fast/llvm.src/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:357:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
              ^
<stdin>:420:2: note: 'next' match was here
 v_mul_f16_e32 v2, v3, v2
 ^
<stdin>:418:26: note: previous match ended here
 v_cvt_f16_f32_e32 v2, v2
                         ^
<stdin>:419:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
...

@llvm-ci
Copy link
Collaborator

llvm-ci commented Jul 2, 2024

LLVM Buildbot has detected a new failure on builder llvm-x86_64-debian-dylib running on gribozavr4 while building llvm at step 7 "test-build-unified-tree-check-llvm".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/1438

Here is the relevant piece of the build log for the reference:

Step 7 (test-build-unified-tree-check-llvm) failure: test (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /b/1/llvm-x86_64-debian-dylib/build/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < /b/1/llvm-x86_64-debian-dylib/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll | /b/1/llvm-x86_64-debian-dylib/build/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | /b/1/llvm-x86_64-debian-dylib/build/bin/FileCheck /b/1/llvm-x86_64-debian-dylib/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/1/llvm-x86_64-debian-dylib/build/bin/FileCheck /b/1/llvm-x86_64-debian-dylib/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+ /b/1/llvm-x86_64-debian-dylib/build/bin/opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink
+ /b/1/llvm-x86_64-debian-dylib/build/bin/llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink
/b/1/llvm-x86_64-debian-dylib/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:62:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
              ^
<stdin>:76:2: note: 'next' match was here
 v_cvt_f16_f32_e32 v1, v1
 ^
<stdin>:74:24: note: previous match ended here
 v_log_f16_e64 v3, |v0|
                       ^
<stdin>:75:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
^
/b/1/llvm-x86_64-debian-dylib/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:105:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
              ^
<stdin>:126:26: note: scanning from here
 v_mul_f32_e32 v2, v2, v3
                         ^
<stdin>:127:2: note: possible intended match here
 v_and_b32_e32 v0, v1, v0
 ^
/b/1/llvm-x86_64-debian-dylib/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:121:15: error: CHECK-NEXT: expected string not found in input
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
              ^
<stdin>:152:26: note: scanning from here
 s_mov_b64 exec, s[18:19]
                         ^
<stdin>:153:2: note: possible intended match here
 v_writelane_b32 v43, s16, 15
 ^
/b/1/llvm-x86_64-debian-dylib/llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll:357:15: error: CHECK-NEXT: is not on the line after the previous match
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
              ^
<stdin>:420:2: note: 'next' match was here
 v_mul_f16_e32 v2, v3, v2
 ^
<stdin>:418:26: note: previous match ended here
 v_cvt_f16_f32_e32 v2, v2
                         ^
<stdin>:419:1: note: non-matching line after previous match is here
 s_movk_i32 s4, 0x7fff
...

lravenclaw pushed a commit to lravenclaw/llvm-project that referenced this pull request Jul 3, 2024
Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.
lravenclaw pushed a commit to lravenclaw/llvm-project that referenced this pull request Jul 3, 2024
kbluck pushed a commit to kbluck/llvm-project that referenced this pull request Jul 6, 2024
Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.
kbluck pushed a commit to kbluck/llvm-project that referenced this pull request Jul 6, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants