Skip to content

Commit ab3d495

Browse files
committed
PeepholeOpt: Do not add subregister indexes to reg_sequence operands
Given that the rest of the pass just gives up when it needs to compose subregisters, folding a subregister extract directly into a reg_sequence is counterproductive. Later fold attempts in the function will give up on the subregister operand, which prevents them from looking through the reg_sequence. It may still be profitable to do these folds if we start handling subregister index composition. There are some test regressions, but the results mostly look better.
1 parent 652ff20 commit ab3d495

29 files changed

+1776
-1859
lines changed

llvm/lib/CodeGen/PeepholeOptimizer.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,12 @@ class RegSequenceRewriter : public Rewriter {
436436
if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands())
437437
return false;
438438

439+
// Do not introduce new subregister uses in a reg_sequence. Until composing
440+
// subregister indices is supported while folding, we're just blocking
441+
// folding of subregister copies later in the function.
442+
if (NewSubReg)
443+
return false;
444+
439445
MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
440446
MO.setReg(NewReg);
441447
MO.setSubReg(NewSubReg);

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Lines changed: 174 additions & 174 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll

Lines changed: 113 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -1667,7 +1667,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
16671667
; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1
16681668
; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0
16691669
; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0
1670-
; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1
1670+
; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1
16711671
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
16721672
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
16731673
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1679,155 +1679,155 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
16791679
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
16801680
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
16811681
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1682-
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
1683-
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0
1682+
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
1683+
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v6, 0
16841684
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
16851685
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
1686-
; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6
1686+
; GFX9-NEXT: v_add3_u32 v8, v3, v0, v5
16871687
; GFX9-NEXT: v_mov_b32_e32 v0, v2
1688-
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1]
1689-
; GFX9-NEXT: v_mov_b32_e32 v6, s11
1690-
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v1
1691-
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3]
1688+
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v8, v[0:1]
1689+
; GFX9-NEXT: v_mov_b32_e32 v5, s11
1690+
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1
1691+
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s9, v6, v[2:3]
16921692
; GFX9-NEXT: s_ashr_i32 s10, s3, 31
16931693
; GFX9-NEXT: v_mov_b32_e32 v0, 0
1694-
; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc
1695-
; GFX9-NEXT: v_sub_u32_e32 v1, s11, v2
1696-
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
1697-
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
1698-
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
1699-
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
1700-
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
1701-
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
1702-
; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v8
1703-
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1]
1704-
; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc
1705-
; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], 1, v5
1706-
; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], 0, v4, s[0:1]
1694+
; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v5, v3, vcc
1695+
; GFX9-NEXT: v_sub_u32_e32 v3, s11, v3
1696+
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
1697+
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
1698+
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v1
1699+
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
1700+
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
1701+
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
1702+
; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v1
1703+
; GFX9-NEXT: v_cndmask_b32_e64 v9, v4, v5, s[0:1]
1704+
; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v3, vcc
1705+
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6
1706+
; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], 0, v8, s[0:1]
17071707
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11
1708-
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
1708+
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
17091709
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10
17101710
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
17111711
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11
1712-
; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, v13, s[0:1]
1713-
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v3
1712+
; GFX9-NEXT: v_cndmask_b32_e64 v13, v4, v13, s[0:1]
1713+
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v5
17141714
; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1]
17151715
; GFX9-NEXT: s_add_u32 s0, s18, s6
17161716
; GFX9-NEXT: s_addc_u32 s1, s19, s6
17171717
; GFX9-NEXT: s_add_u32 s2, s2, s10
17181718
; GFX9-NEXT: s_mov_b32 s11, s10
17191719
; GFX9-NEXT: s_addc_u32 s3, s3, s10
17201720
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
1721-
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3
1721+
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3
17221722
; GFX9-NEXT: v_cvt_f32_u32_e32 v16, s2
1723-
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
1724-
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
1725-
; GFX9-NEXT: v_add_f32_e32 v2, v2, v16
1726-
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
1723+
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
1724+
; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
1725+
; GFX9-NEXT: v_add_f32_e32 v4, v4, v16
1726+
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4
17271727
; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v10
1728-
; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v1, vcc
1729-
; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v2
1730-
; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
1731-
; GFX9-NEXT: v_trunc_f32_e32 v17, v2
1732-
; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v17
1733-
; GFX9-NEXT: v_add_f32_e32 v1, v2, v1
1734-
; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v1
1728+
; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v3, vcc
1729+
; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v4
1730+
; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
1731+
; GFX9-NEXT: v_trunc_f32_e32 v17, v4
1732+
; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v17
1733+
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
1734+
; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v3
17351735
; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
17361736
; GFX9-NEXT: s_sub_u32 s5, 0, s2
17371737
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
1738-
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v18, 0
1739-
; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v14, vcc
1738+
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v18, 0
1739+
; GFX9-NEXT: v_cndmask_b32_e32 v13, v5, v14, vcc
17401740
; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v17
17411741
; GFX9-NEXT: s_subb_u32 s20, 0, s3
17421742
; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc
17431743
; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
1744-
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v14, v[2:3]
1744+
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v14, v[4:5]
17451745
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
1746-
; GFX9-NEXT: v_cndmask_b32_e64 v9, v4, v12, s[0:1]
1747-
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s20, v18, v[2:3]
1748-
; GFX9-NEXT: v_mul_lo_u32 v3, v14, v1
17491746
; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v16, vcc
1750-
; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2
1751-
; GFX9-NEXT: v_mul_hi_u32 v11, v18, v1
1752-
; GFX9-NEXT: v_mul_hi_u32 v1, v14, v1
1753-
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1]
1754-
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
1755-
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
1756-
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v11
1757-
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1758-
; GFX9-NEXT: v_mul_lo_u32 v11, v14, v2
1759-
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
1760-
; GFX9-NEXT: v_mul_hi_u32 v4, v18, v2
1761-
; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2
1762-
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v11, v1
1763-
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
1764-
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4
1765-
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
1766-
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
1767-
; GFX9-NEXT: v_add_u32_e32 v4, v11, v4
1768-
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1769-
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v18, v1
1770-
; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2
1771-
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s5, v11, 0
1772-
; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v2, vcc
1773-
; GFX9-NEXT: v_mov_b32_e32 v1, v4
1774-
; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
1775-
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1]
1776-
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2]
1777-
; GFX9-NEXT: v_xor_b32_e32 v8, s16, v5
1778-
; GFX9-NEXT: v_xor_b32_e32 v9, s17, v9
1779-
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v11, v[1:2]
1780-
; GFX9-NEXT: v_mov_b32_e32 v10, s17
1781-
; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s16, v8
1782-
; GFX9-NEXT: v_xor_b32_e32 v5, s4, v7
1783-
; GFX9-NEXT: v_mul_lo_u32 v7, v12, v3
1784-
; GFX9-NEXT: v_mul_lo_u32 v8, v11, v4
1785-
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v9, v10, vcc
1786-
; GFX9-NEXT: v_mul_hi_u32 v9, v11, v3
1787-
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
1747+
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v18, v[4:5]
1748+
; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v12, s[0:1]
1749+
; GFX9-NEXT: v_mul_lo_u32 v8, v14, v3
1750+
; GFX9-NEXT: v_mul_lo_u32 v9, v18, v4
1751+
; GFX9-NEXT: v_mul_hi_u32 v11, v18, v3
1752+
; GFX9-NEXT: v_mul_hi_u32 v3, v14, v3
1753+
; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, v7, s[0:1]
1754+
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9
1755+
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
1756+
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v11
17881757
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1789-
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9
1790-
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1791-
; GFX9-NEXT: v_mul_lo_u32 v9, v12, v4
1792-
; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3
1793-
; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
1794-
; GFX9-NEXT: v_mul_hi_u32 v8, v11, v4
1795-
; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4
1796-
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3
1758+
; GFX9-NEXT: v_mul_lo_u32 v11, v14, v4
1759+
; GFX9-NEXT: v_add_u32_e32 v8, v9, v8
1760+
; GFX9-NEXT: v_mul_hi_u32 v9, v18, v4
1761+
; GFX9-NEXT: v_mul_hi_u32 v4, v14, v4
1762+
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3
1763+
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
1764+
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v9
17971765
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
17981766
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8
1767+
; GFX9-NEXT: v_add_u32_e32 v9, v11, v9
17991768
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1800-
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
1801-
; GFX9-NEXT: v_add_u32_e32 v8, v9, v8
1769+
; GFX9-NEXT: v_add3_u32 v4, v9, v8, v4
1770+
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v18, v3
1771+
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v14, v4, vcc
1772+
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s5, v8, 0
1773+
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v13, s[0:1]
1774+
; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v10, s[0:1]
1775+
; GFX9-NEXT: v_mov_b32_e32 v1, v4
1776+
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v9, v[1:2]
1777+
; GFX9-NEXT: v_xor_b32_e32 v11, s17, v5
1778+
; GFX9-NEXT: v_xor_b32_e32 v6, s16, v6
1779+
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v8, v[1:2]
1780+
; GFX9-NEXT: v_mov_b32_e32 v12, s17
1781+
; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s16, v6
1782+
; GFX9-NEXT: v_xor_b32_e32 v5, s4, v7
1783+
; GFX9-NEXT: v_mul_lo_u32 v6, v9, v3
1784+
; GFX9-NEXT: v_mul_lo_u32 v7, v8, v4
1785+
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v11, v12, vcc
1786+
; GFX9-NEXT: v_mul_hi_u32 v11, v8, v3
1787+
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
18021788
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1803-
; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4
1789+
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v11
1790+
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
1791+
; GFX9-NEXT: v_mul_lo_u32 v11, v9, v4
1792+
; GFX9-NEXT: v_mul_hi_u32 v3, v9, v3
1793+
; GFX9-NEXT: v_add_u32_e32 v6, v7, v6
1794+
; GFX9-NEXT: v_mul_hi_u32 v7, v8, v4
1795+
; GFX9-NEXT: v_mul_hi_u32 v4, v9, v4
18041796
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3
1805-
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc
1806-
; GFX9-NEXT: v_mul_lo_u32 v7, s9, v3
1807-
; GFX9-NEXT: v_mul_lo_u32 v8, s8, v4
1808-
; GFX9-NEXT: v_mul_hi_u32 v10, s8, v3
1797+
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
1798+
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
1799+
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1800+
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
1801+
; GFX9-NEXT: v_add_u32_e32 v7, v11, v7
1802+
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
1803+
; GFX9-NEXT: v_add3_u32 v4, v7, v6, v4
1804+
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3
1805+
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
1806+
; GFX9-NEXT: v_mul_lo_u32 v6, s9, v3
1807+
; GFX9-NEXT: v_mul_lo_u32 v7, s8, v4
1808+
; GFX9-NEXT: v_mul_hi_u32 v9, s8, v3
18091809
; GFX9-NEXT: v_mul_hi_u32 v3, s9, v3
1810-
; GFX9-NEXT: v_mul_hi_u32 v12, s9, v4
1811-
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
1812-
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1813-
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10
1810+
; GFX9-NEXT: v_mul_hi_u32 v13, s9, v4
1811+
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
18141812
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1815-
; GFX9-NEXT: v_mul_lo_u32 v10, s9, v4
1816-
; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
1817-
; GFX9-NEXT: v_mul_hi_u32 v8, s8, v4
1818-
; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6
1819-
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3
1820-
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
1821-
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8
1822-
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1823-
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v7
1824-
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0
1825-
; GFX9-NEXT: v_mov_b32_e32 v9, s4
1813+
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9
1814+
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
1815+
; GFX9-NEXT: v_mul_lo_u32 v9, s9, v4
1816+
; GFX9-NEXT: v_add_u32_e32 v6, v7, v6
1817+
; GFX9-NEXT: v_mul_hi_u32 v7, s8, v4
1818+
; GFX9-NEXT: v_xor_b32_e32 v10, s4, v10
1819+
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3
1820+
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
1821+
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
18261822
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1823+
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v6
1824+
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0
1825+
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
1826+
; GFX9-NEXT: v_add_u32_e32 v7, v9, v7
1827+
; GFX9-NEXT: v_mov_b32_e32 v8, s4
18271828
; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v5
1828-
; GFX9-NEXT: v_add_u32_e32 v8, v10, v8
1829-
; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
1830-
; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12
1829+
; GFX9-NEXT: v_add3_u32 v9, v7, v12, v13
1830+
; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v10, v8, vcc
18311831
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5]
18321832
; GFX9-NEXT: v_mov_b32_e32 v10, s9
18331833
; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v3

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10041,11 +10041,9 @@ define i64 @udiv_i64_gt_smax(i8 %size) {
1004110041
; GFX9-NEXT: v_mov_b32_e32 v1, 0
1004210042
; GFX9-NEXT: s_mov_b32 s6, 0xcccccccc
1004310043
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1]
10044-
; GFX9-NEXT: v_mov_b32_e32 v6, v3
10045-
; GFX9-NEXT: v_mov_b32_e32 v3, v1
10046-
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3]
10047-
; GFX9-NEXT: v_mov_b32_e32 v0, v1
10048-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
10044+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
10045+
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[0:1]
10046+
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v1
1004910047
; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
1005010048
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
1005110049
; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 3

0 commit comments

Comments
 (0)