Skip to content

Commit 885eaf5

Browse files
choikwazhang2amd
authored and committed
[AMDGPU] Use correct number of bits needed for div/rem shrinking (llvm#80622)
There was an error where a dividend of type i64 whose actual number of used bits was 32 fell into the path that assumes only 24 bits are used. Check that the AtLeast field is used correctly when using computeNumSignBits, and add the necessary extend/trunc for the 32-bit path. Regolden and update test cases. @jrbyrnes @bcahoon @arsenm @rampitec Change-Id: I07b0fe8a27b4107242121d66d9536683bcac1cc0
1 parent 9eccbc0 commit 885eaf5

File tree

11 files changed

+1142
-840
lines changed

11 files changed

+1142
-840
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -951,7 +951,10 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
951951
BinaryOperator &I, Value *Num,
952952
Value *Den, bool IsDiv,
953953
bool IsSigned) const {
954-
int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
954+
unsigned SSBits = Num->getType()->getScalarSizeInBits();
955+
// If Num bits <= 24, assume 0 signbits.
956+
unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
957+
int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
955958
if (DivBits == -1)
956959
return nullptr;
957960
return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
@@ -1123,13 +1126,13 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
11231126
Type *I32Ty = Builder.getInt32Ty();
11241127
Type *F32Ty = Builder.getFloatTy();
11251128

1126-
if (Ty->getScalarSizeInBits() < 32) {
1129+
if (Ty->getScalarSizeInBits() != 32) {
11271130
if (IsSigned) {
1128-
X = Builder.CreateSExt(X, I32Ty);
1129-
Y = Builder.CreateSExt(Y, I32Ty);
1131+
X = Builder.CreateSExtOrTrunc(X, I32Ty);
1132+
Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
11301133
} else {
1131-
X = Builder.CreateZExt(X, I32Ty);
1132-
Y = Builder.CreateZExt(Y, I32Ty);
1134+
X = Builder.CreateZExtOrTrunc(X, I32Ty);
1135+
Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
11331136
}
11341137
}
11351138

@@ -1220,10 +1223,10 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
12201223
if (IsSigned) {
12211224
Res = Builder.CreateXor(Res, Sign);
12221225
Res = Builder.CreateSub(Res, Sign);
1226+
Res = Builder.CreateSExtOrTrunc(Res, Ty);
1227+
} else {
1228+
Res = Builder.CreateZExtOrTrunc(Res, Ty);
12231229
}
1224-
1225-
Res = Builder.CreateTrunc(Res, Ty);
1226-
12271230
return Res;
12281231
}
12291232

llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll

Lines changed: 68 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3156,19 +3156,29 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
31563156
; CGP-LABEL: v_sdiv_i64_24bit:
31573157
; CGP: ; %bb.0:
31583158
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3159-
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
3160-
; CGP-NEXT: v_cvt_f32_i32_e32 v1, v1
3161-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
3162-
; CGP-NEXT: v_cvt_f32_i32_e32 v0, v0
3163-
; CGP-NEXT: v_rcp_f32_e32 v2, v1
3164-
; CGP-NEXT: v_mul_f32_e32 v2, v0, v2
3165-
; CGP-NEXT: v_trunc_f32_e32 v2, v2
3166-
; CGP-NEXT: v_mad_f32 v0, -v2, v1, v0
3167-
; CGP-NEXT: v_cvt_i32_f32_e32 v2, v2
3168-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, |v1|
3169-
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
3170-
; CGP-NEXT: v_add_i32_e32 v0, vcc, v2, v0
3171-
; CGP-NEXT: v_bfe_i32 v0, v0, 0, 25
3159+
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
3160+
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
3161+
; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
3162+
; CGP-NEXT: v_rcp_f32_e32 v1, v1
3163+
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
3164+
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
3165+
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
3166+
; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
3167+
; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
3168+
; CGP-NEXT: v_mov_b32_e32 v0, v2
3169+
; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
3170+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
3171+
; CGP-NEXT: v_mov_b32_e32 v0, v1
3172+
; CGP-NEXT: v_mul_lo_u32 v1, v0, v3
3173+
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
3174+
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
3175+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
3176+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
3177+
; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3
3178+
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
3179+
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
3180+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
3181+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
31723182
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
31733183
; CGP-NEXT: s_setpc_b64 s[30:31]
31743184
%num.mask = and i64 %num, 16777215
@@ -3437,32 +3447,52 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
34373447
; CGP-LABEL: v_sdiv_v2i64_24bit:
34383448
; CGP: ; %bb.0:
34393449
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3440-
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4
3441-
; CGP-NEXT: v_cvt_f32_i32_e32 v1, v1
3442-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
3443-
; CGP-NEXT: v_cvt_f32_i32_e32 v0, v0
3450+
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4
3451+
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
34443452
; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
3445-
; CGP-NEXT: v_rcp_f32_e32 v3, v1
3446-
; CGP-NEXT: v_cvt_f32_i32_e32 v4, v4
3447-
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
3448-
; CGP-NEXT: v_cvt_f32_i32_e32 v2, v2
3449-
; CGP-NEXT: v_mul_f32_e32 v3, v0, v3
3450-
; CGP-NEXT: v_trunc_f32_e32 v3, v3
3451-
; CGP-NEXT: v_mad_f32 v0, -v3, v1, v0
3452-
; CGP-NEXT: v_cvt_i32_f32_e32 v3, v3
3453-
; CGP-NEXT: v_rcp_f32_e32 v5, v4
3454-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, |v1|
3455-
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
3456-
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
3457-
; CGP-NEXT: v_mul_f32_e32 v3, v2, v5
3458-
; CGP-NEXT: v_trunc_f32_e32 v3, v3
3459-
; CGP-NEXT: v_mad_f32 v2, -v3, v4, v2
3460-
; CGP-NEXT: v_cvt_i32_f32_e32 v3, v3
3461-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v4|
3462-
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
3463-
; CGP-NEXT: v_bfe_i32 v0, v0, 0, 25
3464-
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
3465-
; CGP-NEXT: v_bfe_i32 v2, v2, 0, 25
3453+
; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
3454+
; CGP-NEXT: v_rcp_f32_e32 v1, v1
3455+
; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0
3456+
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
3457+
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1
3458+
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4
3459+
; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
3460+
; CGP-NEXT: v_rcp_f32_e32 v8, v1
3461+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
3462+
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8
3463+
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v0
3464+
; CGP-NEXT: v_mov_b32_e32 v0, v1
3465+
; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0
3466+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0
3467+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
3468+
; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v2
3469+
; CGP-NEXT: v_mul_lo_u32 v2, v0, v8
3470+
; CGP-NEXT: v_mov_b32_e32 v0, v1
3471+
; CGP-NEXT: v_mul_lo_u32 v1, v0, v3
3472+
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v0
3473+
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v1
3474+
; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v2, 0
3475+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
3476+
; CGP-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc
3477+
; CGP-NEXT: v_mov_b32_e32 v0, v2
3478+
; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v8, v0
3479+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
3480+
; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v7, v3
3481+
; CGP-NEXT: v_mov_b32_e32 v2, v1
3482+
; CGP-NEXT: v_mul_lo_u32 v8, v2, v4
3483+
; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc
3484+
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
3485+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
3486+
; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
3487+
; CGP-NEXT: v_sub_i32_e32 v3, vcc, v6, v8
3488+
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
3489+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
3490+
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
3491+
; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v4
3492+
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
3493+
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
3494+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
3495+
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
34663496
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
34673497
; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2
34683498
; CGP-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll

Lines changed: 62 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3088,21 +3088,27 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
30883088
; CGP-LABEL: v_srem_i64_24bit:
30893089
; CGP: ; %bb.0:
30903090
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3091-
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
3092-
; CGP-NEXT: v_cvt_f32_i32_e32 v2, v1
3093-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
3094-
; CGP-NEXT: v_cvt_f32_i32_e32 v3, v0
3095-
; CGP-NEXT: v_rcp_f32_e32 v4, v2
3096-
; CGP-NEXT: v_mul_f32_e32 v4, v3, v4
3097-
; CGP-NEXT: v_trunc_f32_e32 v4, v4
3098-
; CGP-NEXT: v_mad_f32 v3, -v4, v2, v3
3099-
; CGP-NEXT: v_cvt_i32_f32_e32 v4, v4
3100-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
3101-
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
3102-
; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
3103-
; CGP-NEXT: v_mul_lo_u32 v1, v2, v1
3104-
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
3105-
; CGP-NEXT: v_bfe_i32 v0, v0, 0, 25
3091+
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
3092+
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
3093+
; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
3094+
; CGP-NEXT: v_rcp_f32_e32 v1, v1
3095+
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
3096+
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
3097+
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
3098+
; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
3099+
; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
3100+
; CGP-NEXT: v_mov_b32_e32 v0, v2
3101+
; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
3102+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
3103+
; CGP-NEXT: v_mov_b32_e32 v0, v1
3104+
; CGP-NEXT: v_mul_lo_u32 v0, v0, v3
3105+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
3106+
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3
3107+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
3108+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
3109+
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3
3110+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
3111+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
31063112
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
31073113
; CGP-NEXT: s_setpc_b64 s[30:31]
31083114
%num.mask = and i64 %num, 16777215
@@ -3370,37 +3376,49 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
33703376
; CGP-LABEL: v_srem_v2i64_24bit:
33713377
; CGP: ; %bb.0:
33723378
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3373-
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4
3374-
; CGP-NEXT: v_cvt_f32_i32_e32 v3, v1
3375-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
3376-
; CGP-NEXT: v_cvt_f32_i32_e32 v4, v0
3377-
; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6
3378-
; CGP-NEXT: v_rcp_f32_e32 v5, v3
3379+
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4
3380+
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
3381+
; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
3382+
; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
3383+
; CGP-NEXT: v_rcp_f32_e32 v1, v1
3384+
; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0
33793385
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
3380-
; CGP-NEXT: v_mul_f32_e32 v5, v4, v5
3381-
; CGP-NEXT: v_trunc_f32_e32 v5, v5
3382-
; CGP-NEXT: v_mad_f32 v4, -v5, v3, v4
3383-
; CGP-NEXT: v_cvt_i32_f32_e32 v5, v5
3384-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
3385-
; CGP-NEXT: v_cvt_f32_i32_e32 v4, v6
3386-
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
3387-
; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
3388-
; CGP-NEXT: v_mul_lo_u32 v1, v3, v1
3389-
; CGP-NEXT: v_cvt_f32_i32_e32 v3, v2
3390-
; CGP-NEXT: v_rcp_f32_e32 v5, v4
3391-
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
3392-
; CGP-NEXT: v_mul_f32_e32 v1, v3, v5
3393-
; CGP-NEXT: v_trunc_f32_e32 v1, v1
3394-
; CGP-NEXT: v_mad_f32 v3, -v1, v4, v3
3395-
; CGP-NEXT: v_cvt_i32_f32_e32 v1, v1
3396-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v4|
3397-
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
3398-
; CGP-NEXT: v_bfe_i32 v0, v0, 0, 25
3399-
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
3400-
; CGP-NEXT: v_mul_lo_u32 v3, v1, v6
3386+
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
3387+
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1
3388+
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4
3389+
; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
3390+
; CGP-NEXT: v_rcp_f32_e32 v8, v1
3391+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
3392+
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8
3393+
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
3394+
; CGP-NEXT: v_mov_b32_e32 v0, v1
3395+
; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0
3396+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0
3397+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
3398+
; CGP-NEXT: v_mul_lo_u32 v5, v0, v6
3399+
; CGP-NEXT: v_mov_b32_e32 v0, v1
3400+
; CGP-NEXT: v_mul_lo_u32 v8, v0, v3
3401+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
3402+
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v7, v8
3403+
; CGP-NEXT: v_mov_b32_e32 v0, v1
3404+
; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0
3405+
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
3406+
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3
3407+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
3408+
; CGP-NEXT: v_mov_b32_e32 v0, v1
3409+
; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
3410+
; CGP-NEXT: v_mul_lo_u32 v7, v0, v4
3411+
; CGP-NEXT: v_sub_i32_e32 v6, vcc, v5, v3
3412+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
3413+
; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
3414+
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
3415+
; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4
3416+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
3417+
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
3418+
; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4
3419+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
3420+
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
34013421
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
3402-
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
3403-
; CGP-NEXT: v_bfe_i32 v2, v2, 0, 25
34043422
; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2
34053423
; CGP-NEXT: s_setpc_b64 s[30:31]
34063424
%num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>

llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll

Lines changed: 31 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -415,25 +415,17 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
415415
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
416416
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
417417
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
418-
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
419-
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
420-
; CGP-NEXT: v_rcp_f32_e32 v2, v2
421-
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
418+
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0
419+
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1
420+
; CGP-NEXT: v_rcp_f32_e32 v2, v1
421+
; CGP-NEXT: v_mul_f32_e32 v2, v0, v2
422+
; CGP-NEXT: v_trunc_f32_e32 v2, v2
423+
; CGP-NEXT: v_fma_f32 v0, -v2, v1, v0
422424
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
423-
; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
424-
; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
425-
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
426-
; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
427-
; CGP-NEXT: v_mul_lo_u32 v3, v2, v1
428-
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
429-
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
430-
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
431-
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
432-
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
433-
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
434-
; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v2
435-
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
436-
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
425+
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1
426+
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
427+
; CGP-NEXT: v_add_i32_e32 v0, vcc, v2, v0
428+
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
437429
; CGP-NEXT: s_setpc_b64 s[30:31]
438430
%num.mask = and i32 %num, 16777215
439431
%den.mask = and i32 %den, 16777215
@@ -496,44 +488,28 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
496488
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
497489
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
498490
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
499-
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
500-
; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
501-
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
502-
; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
503-
; CGP-NEXT: v_rcp_f32_e32 v4, v4
504-
; CGP-NEXT: v_rcp_f32_e32 v6, v6
505-
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
506-
; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
491+
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0
492+
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2
493+
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1
494+
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3
495+
; CGP-NEXT: v_rcp_f32_e32 v4, v2
496+
; CGP-NEXT: v_rcp_f32_e32 v5, v3
497+
; CGP-NEXT: v_mul_f32_e32 v4, v0, v4
498+
; CGP-NEXT: v_mul_f32_e32 v5, v1, v5
499+
; CGP-NEXT: v_trunc_f32_e32 v4, v4
500+
; CGP-NEXT: v_trunc_f32_e32 v5, v5
501+
; CGP-NEXT: v_fma_f32 v0, -v4, v2, v0
507502
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
508-
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
509-
; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
510-
; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
511-
; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
512-
; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
513-
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
514-
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
515-
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
516-
; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
517-
; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
518-
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
519-
; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
520-
; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
521-
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
522-
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
523-
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
524-
; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
525-
; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
526-
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
527-
; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
528-
; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
529-
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
530-
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
531-
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
532-
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
533-
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
534-
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
535-
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
536-
; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
503+
; CGP-NEXT: v_fma_f32 v1, -v5, v3, v1
504+
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
505+
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v2
506+
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
507+
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v3
508+
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
509+
; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
510+
; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
511+
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
512+
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
537513
; CGP-NEXT: s_setpc_b64 s[30:31]
538514
%num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
539515
%den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>

0 commit comments

Comments
 (0)