Reapply "[AMDGPU] prevent shrinking udiv/urem if either operand is in (SignedMax,UnsignedMax] (#116733)" #118928
This reverts commit 905e831. Handle the signed and unsigned paths differently in getDivNumBits. Using computeKnownBits, this rejects shrinking an unsigned div/rem if either operand exceeds the signed max, since we know NumSignBits will always be 0.
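To see why shrinking is unsound for such operands, here is a hypothetical standalone C++ sketch (not code from this patch) of what a blind 64-to-32-bit narrowing would compute:

```cpp
// Hypothetical sketch: narrowing a 64-bit unsigned division is only sound
// when both operands are known to fit the narrow type. A numerator in
// (INT64_MAX, UINT64_MAX] has no known-zero high bits, so narrowing lies.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Num = UINT64_MAX;            // above the signed max
  uint64_t Wide = Num / 10;             // correct 64-bit udiv: 1844674407370955161
  uint32_t Narrow = uint32_t(Num) / 10; // a blind 32-bit shrink: 429496729
  printf("wide=%llu narrow=%u\n", (unsigned long long)Wide, Narrow);
  return 0;
}
```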
@llvm/pr-subscribers-backend-amdgpu

Author: choikwa (choikwa)

Changes: Rebased and re-attempted after the first attempt was reverted due to an unrelated failure in libc (which I'm told should be fixed by now).

Patch is 47.41 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/118928.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 75e20c79301681..eb01fdd4292898 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1195,18 +1195,34 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
 int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                             Value *Den, unsigned AtLeast,
                                             bool IsSigned) const {
-  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
-  if (LHSSignBits < AtLeast)
+  if (IsSigned) {
+    unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
+    if (RHSSignBits < AtLeast)
+      return -1;
+
+    unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
+    if (LHSSignBits < AtLeast)
+      return -1;
+
+    unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+    unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
+    return DivBits + 1; // a SignBit needs to be reserved for shrinking
+  }
+
+  // All bits are used for unsigned division for Num or Den in range
+  // (SignedMax, UnsignedMax].
+  KnownBits Known = computeKnownBits(Den, DL, 0, AC, &I);
+  if (Known.isNegative() || !Known.isNonNegative())
     return -1;
+  unsigned RHSSignBits = Known.countMinLeadingZeros();
-  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
-  if (RHSSignBits < AtLeast)
+  Known = computeKnownBits(Num, DL, 0, AC, &I);
+  if (Known.isNegative() || !Known.isNonNegative())
     return -1;
+  unsigned LHSSignBits = Known.countMinLeadingZeros();
 
   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
   unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
-  if (IsSigned)
-    ++DivBits;
   return DivBits;
 }
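As a rough standalone analogy for the two paths above (assumption: integer bounds and precomputed sign-bit counts stand in for LLVM's KnownBits and ComputeNumSignBits; the function names are made up for illustration):

```cpp
#include <algorithm>
#include <bit>     // std::countl_zero, C++20
#include <cstdint>

// Unsigned path: the usable width is BitWidth minus the known-zero leading
// bits. If the top bit of either operand may be set (the value may lie in
// (SignedMax, UnsignedMax]), nothing is known and shrinking is rejected,
// mirroring the isNegative()/!isNonNegative() check.
int unsignedDivNumBits(uint64_t NumMax, uint64_t DenMax) {
  if ((NumMax >> 63) || (DenMax >> 63))
    return -1; // sign bit not known zero: keep the full 64 bits
  int LeadingZeros =
      std::min(std::countl_zero(NumMax), std::countl_zero(DenMax));
  return 64 - LeadingZeros;
}

// Signed path: with SignBits redundant sign-extension bits, the value needs
// 64 - SignBits bits plus one reserved sign bit: hence `DivBits + 1`.
int signedDivNumBits(unsigned NumSignBits, unsigned DenSignBits) {
  unsigned SignBits = std::min(NumSignBits, DenSignBits);
  return int(64 - SignBits) + 1;
}
```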
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index b7436aeb1d5302..a0878a5d0e4b13 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -9999,3 +9999,101 @@ define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) {
%result = udiv exact <2 x i64> %num, <i64 4096, i64 1024>
ret <2 x i64> %result
}
+
+define i64 @udiv_i64_gt_smax(i8 %size) {
+; GFX6-LABEL: udiv_i64_gt_smax:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX6-NEXT: v_not_b32_e32 v1, v1
+; GFX6-NEXT: v_not_b32_e32 v0, v0
+; GFX6-NEXT: s_mov_b32 s4, 0xcccccccd
+; GFX6-NEXT: v_mul_lo_u32 v3, v1, s4
+; GFX6-NEXT: v_mul_hi_u32 v4, v0, s4
+; GFX6-NEXT: s_mov_b32 s6, 0xcccccccc
+; GFX6-NEXT: v_mul_hi_u32 v5, v1, s4
+; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s6
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 3
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 3, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: udiv_i64_gt_smax:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 31
+; GFX9-NEXT: v_not_b32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-NEXT: s_mov_b32 s4, 0xcccccccd
+; GFX9-NEXT: v_ashrrev_i32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_mul_hi_u32 v0, v4, s4
+; GFX9-NEXT: v_not_b32_e32 v5, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s6, 0xcccccccc
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
+; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 3, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %esize = sext i8 %size to i64
+ %minus = sub nuw nsw i64 -1, %esize
+ %div = udiv i64 %minus, 10
+ ret i64 %div
+}
+
+define i64 @udiv_i64_9divbits(i8 %size) {
+; GFX6-LABEL: udiv_i64_9divbits:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX6-NEXT: s_mov_b32 s4, 0x41200000
+; GFX6-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0
+; GFX6-NEXT: v_trunc_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GFX6-NEXT: v_mad_f32 v0, -v1, s4, v0
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, 0x1ff, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: udiv_i64_9divbits:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT: s_mov_b32 s4, 0x41200000
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0
+; GFX9-NEXT: v_trunc_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1
+; GFX9-NEXT: v_mad_f32 v0, -v1, s4, v0
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0x1ff, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %zextend = zext i8 %size to i64
+ %num = add nuw nsw i64 1, %zextend
+ %div = udiv i64 %num, 10
+ ret i64 %div
+}
+
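A hedged reading of the two new tests: in udiv_i64_gt_smax the numerator `-1 - sext(size)` equals the bitwise NOT of the sign-extended byte, so it can exceed INT64_MAX as unsigned and the checks expect the full 64-bit magic-number expansion; in udiv_i64_9divbits the numerator `zext(i8) + 1` is at most 256, so the checks expect the 24-bit float path with a final 9-bit mask (`v_and_b32 ... 0x1ff`). A small standalone check of both arithmetic facts:

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  // udiv_i64_gt_smax: -1 - sext(size) == ~sext(size); for size >= 0 the
  // top bit is set, i.e. the numerator exceeds INT64_MAX as unsigned.
  int8_t size = 0;
  uint64_t esize = uint64_t(int64_t(size)); // sext i8 -> i64
  uint64_t minus = uint64_t(-1) - esize;    // the IR's `sub i64 -1, %esize`
  assert(minus == ~esize);
  assert(minus > uint64_t(INT64_MAX));

  // udiv_i64_9divbits: zext(i8) + 1 <= 256 needs only 9 bits, comfortably
  // inside the 24-bit float expansion's range.
  uint64_t numMax = 256;
  assert(64 - std::countl_zero(numMax) == 9);
  return 0;
}
```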
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 5bbea7ecf3f2d5..5dde193528aa4e 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -1021,8 +1021,116 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) {
; GFX9-LABEL: sdiv64_known32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX9-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB10_2
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0
+; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, 0, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v0, vcc
+; GFX9-NEXT: v_madmk_f32 v1, v3, 0x4f800000, v1
+; GFX9-NEXT: v_rcp_f32_e32 v1, v1
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1
+; GFX9-NEXT: v_trunc_f32_e32 v3, v3
+; GFX9-NEXT: v_madmk_f32 v1, v3, 0xcf800000, v1
+; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v3
+; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT: v_mul_lo_u32 v5, v11, v10
+; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0
+; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8
+; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0
+; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v9, v4
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v10, v4, vcc
+; GFX9-NEXT: v_mul_lo_u32 v5, v11, v13
+; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0
+; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0
+; GFX9-NEXT: v_mul_hi_u32 v12, v1, v3
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v11, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0
+; GFX9-NEXT: v_mul_hi_u32 v8, v7, v1
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v8, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v10, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v4, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v0, v1
+; GFX9-NEXT: v_mul_lo_u32 v9, v6, v5
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0
+; GFX9-NEXT: v_add3_u32 v4, v4, v9, v8
+; GFX9-NEXT: v_sub_u32_e32 v8, v2, v4
+; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v7, v3
+; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v6
+; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5]
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: .LBB10_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
+; GFX9-NEXT: s_cbranch_execz .LBB10_4
+; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v3
; GFX9-NEXT: v_sub_u32_e32 v2, 0, v3
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -1033,14 +1141,17 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) {
; GFX9-NEXT: v_mul_lo_u32 v2, v0, v3
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GFX9-NEXT: v_sub_u32_e32 v2, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX9-NEXT: .LBB10_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a.ext = ashr i64 %a, 32
%b.ext = ashr i64 %b, 32
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index a77e3c226ad267..077b9045a7dc15 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -576,77 +576,51 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-LABEL: s_test_udiv31_i64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[4:5], 0xe
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s8, s0, 1
-; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
-; GCN-NEXT: s_sub_i32 s0, 0, s8
-; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_mul_lo_u32 v1, s0, v0
+; GCN-NEXT: s_load_dword s6, s[4:5], 0xe
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshr_b32 s2, s6, 1
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s0, s0, s8
-; GCN-NEXT: s_sub_i32 s0, s2, s0
-; GCN-NEXT: s_sub_i32 s1, s0, s8
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GCN-NEXT: s_cmp_ge_u32 s0, s8
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GCN-NEXT: s_cmp_ge_u32 s0, s8
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
+; GCN-NEXT: v_trunc_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2
+; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
+; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_udiv31_i64:
; GCN-IR: ; %bb.0:
-; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe
-; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s6, -1
-; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1
-; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8
-; GCN-IR-NEXT: s_sub_i32 s0, 0, s8
-; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0
+; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1
+; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT: s_lshr_b32 s2, s6, 1
+; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GCN-IR-NEXT: s_mov_b32 s6, -1
+; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GCN-IR-NEXT: s_mov_b32 s4, s0
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-IR-NEXT: s_mov_b32 s5, s1
-; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
-; GCN-IR-NEXT: s_sub_i32 s0, s2, s0
-; GCN-IR-NEXT: s_sub_i32 s1, s0, s8
-; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8
-; GCN-IR-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-IR-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GCN-IR-NEXT: s_cmp_ge_u32 s0, s8
-; GCN-IR-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-IR-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2
+; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2
+; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2
+; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1
+; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
+; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%1 = lshr i64 %x, 33
@@ -716,199 +690,66 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64
define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) {
; GCN-LABEL: s_test_udiv24_i48:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s0, s0, 0xff000000
-; GCN-NEXT: s_and_b32 s1, s1, 0xffff
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0
-; GCN-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NEXT: s_and_b32 s6, s6, 0xff000000
-; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 24
-; GCN-NEXT: v_mac_f32_e32 v1, 0, v2
-; GCN-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-NEXT: s_sub_u32 s8, 0, s0
-; GCN-NEXT: s_subb_u32 s9, 0, s1
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
+; GCN-NEXT: s_and_b32 s2, s2, 0xff000000
+; GCN-NEXT: s_and_b32 s4, s4, 0xff000000
+; GCN-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT: s_and_b32 s3, s3, 0xffff
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 24
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
; GCN-NEXT: v_trunc_f32_e32 v2, v2
-; GCN-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
+; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: v_mul_lo_u32 v3, s8, v2
-; GCN-NEXT: v_mul_hi_u32 v4, s8, v1
-; GCN-NEXT: v_mul_lo_u32 v5, s9, v1
-; GCN-NEXT: v_mul_lo_u32 v6, s8, v1
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v3
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v6
-;...
[truncated]
This broke the libc-on-GPU bot: https://lab.llvm.org/buildbot/#/builders/73/builds/9787
Revert "Reapply "[AMDGPU] prevent shrinking udiv/urem if either operand is in… (#118928)"" This reverts commit 509893b. This broke the libc build again: https://lab.llvm.org/buildbot/#/builders/73/builds/9787.
Reapply "[AMDGPU] prevent shrinking udiv/urem if either operand is in… (llvm#118928)" This reverts commit 254d206. Added a fix in ExpandDivRem24 to disqualify the expansion if DivNumBits exceeds 24. Original commit & msg: ce6e955. Handle the signed and unsigned paths differently in getDivNumBits. Using computeKnownBits, this rejects shrinking unsigned div/rem if operands exceed the signed max, since we know NumSignBits will always be 0.
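The ExpandDivRem24 fix mentioned above presumably amounts to an early bail-out when the required width no longer fits the 24-bit float expansion; a hedged sketch of that guard's shape (names assumed, not quoted from the patch):

```cpp
// Hedged sketch: the eligibility guard for the f32-based 24-bit expansion.
// A getDivNumBits-style result of -1 means "cannot determine / must not
// shrink"; otherwise it is the number of bits the divide actually needs.
static bool canUseDivRem24(int DivNumBits) {
  return DivNumBits != -1 && DivNumBits <= 24;
}

int main() {
  // 9 bits fits the 24-bit path; 30 bits and failure (-1) do not.
  return canUseDivRem24(9) && !canUseDivRem24(30) && !canUseDivRem24(-1) ? 0 : 1;
}
```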