@@ -1459,58 +1459,53 @@ define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* n
1459
1459
; CHECK-NEXT: cbz r2, .LBB9_3
1460
1460
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1461
1461
; CHECK-NEXT: subs r3, r2, #1
1462
- ; CHECK-NEXT: and r5, r2, #3
1462
+ ; CHECK-NEXT: and lr, r2, #3
1463
+ ; CHECK-NEXT: vldr s0, .LCPI9_0
1463
1464
; CHECK-NEXT: cmp r3, #3
1464
1465
; CHECK-NEXT: bhs .LBB9_4
1465
1466
; CHECK-NEXT: @ %bb.2:
1466
- ; CHECK-NEXT: vldr s0, .LCPI9_0
1467
- ; CHECK-NEXT: mov.w r12, #0
1467
+ ; CHECK-NEXT: movs r2, #0
1468
1468
; CHECK-NEXT: b .LBB9_6
1469
1469
; CHECK-NEXT: .LBB9_3:
1470
1470
; CHECK-NEXT: vldr s0, .LCPI9_0
1471
1471
; CHECK-NEXT: b .LBB9_9
1472
1472
; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
1473
- ; CHECK-NEXT: bic r2, r2, #3
1474
- ; CHECK-NEXT: movs r3, #1
1475
- ; CHECK-NEXT: subs r2, #4
1476
- ; CHECK-NEXT: vldr s0, .LCPI9_0
1477
- ; CHECK-NEXT: mov.w r12, #0
1478
- ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1473
+ ; CHECK-NEXT: sub.w r12, r2, lr
1479
1474
; CHECK-NEXT: movs r3, #0
1480
- ; CHECK-NEXT: dls lr, lr
1475
+ ; CHECK-NEXT: movs r2, #0
1481
1476
; CHECK-NEXT: .LBB9_5: @ %for.body
1482
1477
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1483
- ; CHECK-NEXT: adds r4 , r0, r3
1484
- ; CHECK-NEXT: adds r2 , r1, r3
1485
- ; CHECK-NEXT: vldr.16 s2, [r2 , #6]
1486
- ; CHECK-NEXT: vldr.16 s4, [r4 , #6]
1487
- ; CHECK-NEXT: vldr.16 s6, [r4 , #4]
1488
- ; CHECK-NEXT: vldr.16 s8, [r4 , #2]
1478
+ ; CHECK-NEXT: adds r5 , r0, r3
1479
+ ; CHECK-NEXT: adds r4 , r1, r3
1480
+ ; CHECK-NEXT: vldr.16 s2, [r4 , #6]
1481
+ ; CHECK-NEXT: vldr.16 s4, [r5 , #6]
1482
+ ; CHECK-NEXT: vldr.16 s6, [r5 , #4]
1483
+ ; CHECK-NEXT: vldr.16 s8, [r5 , #2]
1489
1484
; CHECK-NEXT: vmul.f16 s2, s4, s2
1490
- ; CHECK-NEXT: vldr.16 s4, [r2 , #4]
1491
- ; CHECK-NEXT: vldr.16 s10, [r4 ]
1485
+ ; CHECK-NEXT: vldr.16 s4, [r4 , #4]
1486
+ ; CHECK-NEXT: vldr.16 s10, [r5 ]
1492
1487
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1493
1488
; CHECK-NEXT: vmul.f16 s4, s6, s4
1494
- ; CHECK-NEXT: vldr.16 s6, [r2 , #2]
1489
+ ; CHECK-NEXT: vldr.16 s6, [r4 , #2]
1495
1490
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1496
- ; CHECK-NEXT: adds r3 , #8
1491
+ ; CHECK-NEXT: adds r2 , #4
1497
1492
; CHECK-NEXT: vmul.f16 s6, s8, s6
1498
- ; CHECK-NEXT: vldr.16 s8, [r2 ]
1493
+ ; CHECK-NEXT: vldr.16 s8, [r4 ]
1499
1494
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1500
- ; CHECK-NEXT: add.w r12, r12, #4
1495
+ ; CHECK-NEXT: adds r3, #8
1501
1496
; CHECK-NEXT: vmul.f16 s8, s10, s8
1497
+ ; CHECK-NEXT: cmp r12, r2
1502
1498
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1503
1499
; CHECK-NEXT: vadd.f32 s0, s0, s8
1504
1500
; CHECK-NEXT: vadd.f32 s0, s0, s6
1505
1501
; CHECK-NEXT: vadd.f32 s0, s0, s4
1506
1502
; CHECK-NEXT: vadd.f32 s0, s0, s2
1507
- ; CHECK-NEXT: le lr, .LBB9_5
1503
+ ; CHECK-NEXT: bne .LBB9_5
1508
1504
; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1509
- ; CHECK-NEXT: wls lr, r5 , .LBB9_9
1505
+ ; CHECK-NEXT: wls lr, lr , .LBB9_9
1510
1506
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1511
- ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1512
- ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1513
- ; CHECK-NEXT: mov lr, r5
1507
+ ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1508
+ ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1514
1509
; CHECK-NEXT: .LBB9_8: @ %for.body.epil
1515
1510
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1516
1511
; CHECK-NEXT: vldr.16 s2, [r1]
@@ -1616,58 +1611,53 @@ define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* n
1616
1611
; CHECK-NEXT: cbz r2, .LBB10_3
1617
1612
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1618
1613
; CHECK-NEXT: subs r3, r2, #1
1619
- ; CHECK-NEXT: and r5, r2, #3
1614
+ ; CHECK-NEXT: and lr, r2, #3
1615
+ ; CHECK-NEXT: vldr s0, .LCPI10_0
1620
1616
; CHECK-NEXT: cmp r3, #3
1621
1617
; CHECK-NEXT: bhs .LBB10_4
1622
1618
; CHECK-NEXT: @ %bb.2:
1623
- ; CHECK-NEXT: vldr s0, .LCPI10_0
1624
- ; CHECK-NEXT: mov.w r12, #0
1619
+ ; CHECK-NEXT: movs r2, #0
1625
1620
; CHECK-NEXT: b .LBB10_6
1626
1621
; CHECK-NEXT: .LBB10_3:
1627
1622
; CHECK-NEXT: vldr s0, .LCPI10_0
1628
1623
; CHECK-NEXT: b .LBB10_9
1629
1624
; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
1630
- ; CHECK-NEXT: bic r2, r2, #3
1631
- ; CHECK-NEXT: movs r3, #1
1632
- ; CHECK-NEXT: subs r2, #4
1633
- ; CHECK-NEXT: vldr s0, .LCPI10_0
1634
- ; CHECK-NEXT: mov.w r12, #0
1635
- ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1625
+ ; CHECK-NEXT: sub.w r12, r2, lr
1636
1626
; CHECK-NEXT: movs r3, #0
1637
- ; CHECK-NEXT: dls lr, lr
1627
+ ; CHECK-NEXT: movs r2, #0
1638
1628
; CHECK-NEXT: .LBB10_5: @ %for.body
1639
1629
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1640
- ; CHECK-NEXT: adds r4 , r0, r3
1641
- ; CHECK-NEXT: adds r2 , r1, r3
1642
- ; CHECK-NEXT: vldr.16 s2, [r2 , #6]
1643
- ; CHECK-NEXT: vldr.16 s4, [r4 , #6]
1644
- ; CHECK-NEXT: vldr.16 s6, [r4 , #4]
1645
- ; CHECK-NEXT: vldr.16 s8, [r4 , #2]
1630
+ ; CHECK-NEXT: adds r5 , r0, r3
1631
+ ; CHECK-NEXT: adds r4 , r1, r3
1632
+ ; CHECK-NEXT: vldr.16 s2, [r4 , #6]
1633
+ ; CHECK-NEXT: vldr.16 s4, [r5 , #6]
1634
+ ; CHECK-NEXT: vldr.16 s6, [r5 , #4]
1635
+ ; CHECK-NEXT: vldr.16 s8, [r5 , #2]
1646
1636
; CHECK-NEXT: vadd.f16 s2, s4, s2
1647
- ; CHECK-NEXT: vldr.16 s4, [r2 , #4]
1648
- ; CHECK-NEXT: vldr.16 s10, [r4 ]
1637
+ ; CHECK-NEXT: vldr.16 s4, [r4 , #4]
1638
+ ; CHECK-NEXT: vldr.16 s10, [r5 ]
1649
1639
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1650
1640
; CHECK-NEXT: vadd.f16 s4, s6, s4
1651
- ; CHECK-NEXT: vldr.16 s6, [r2 , #2]
1641
+ ; CHECK-NEXT: vldr.16 s6, [r4 , #2]
1652
1642
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1653
- ; CHECK-NEXT: adds r3 , #8
1643
+ ; CHECK-NEXT: adds r2 , #4
1654
1644
; CHECK-NEXT: vadd.f16 s6, s8, s6
1655
- ; CHECK-NEXT: vldr.16 s8, [r2 ]
1645
+ ; CHECK-NEXT: vldr.16 s8, [r4 ]
1656
1646
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1657
- ; CHECK-NEXT: add.w r12, r12, #4
1647
+ ; CHECK-NEXT: adds r3, #8
1658
1648
; CHECK-NEXT: vadd.f16 s8, s10, s8
1649
+ ; CHECK-NEXT: cmp r12, r2
1659
1650
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1660
1651
; CHECK-NEXT: vadd.f32 s0, s0, s8
1661
1652
; CHECK-NEXT: vadd.f32 s0, s0, s6
1662
1653
; CHECK-NEXT: vadd.f32 s0, s0, s4
1663
1654
; CHECK-NEXT: vadd.f32 s0, s0, s2
1664
- ; CHECK-NEXT: le lr, .LBB10_5
1655
+ ; CHECK-NEXT: bne .LBB10_5
1665
1656
; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1666
- ; CHECK-NEXT: wls lr, r5 , .LBB10_9
1657
+ ; CHECK-NEXT: wls lr, lr , .LBB10_9
1667
1658
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1668
- ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1669
- ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1670
- ; CHECK-NEXT: mov lr, r5
1659
+ ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1660
+ ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1671
1661
; CHECK-NEXT: .LBB10_8: @ %for.body.epil
1672
1662
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1673
1663
; CHECK-NEXT: vldr.16 s2, [r1]
@@ -1773,65 +1763,60 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
1773
1763
; CHECK-NEXT: cbz r2, .LBB11_3
1774
1764
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1775
1765
; CHECK-NEXT: subs r3, r2, #1
1776
- ; CHECK-NEXT: and r6, r2, #3
1766
+ ; CHECK-NEXT: and lr, r2, #3
1767
+ ; CHECK-NEXT: vldr s0, .LCPI11_0
1777
1768
; CHECK-NEXT: cmp r3, #3
1778
1769
; CHECK-NEXT: bhs .LBB11_4
1779
1770
; CHECK-NEXT: @ %bb.2:
1780
- ; CHECK-NEXT: vldr s0, .LCPI11_0
1781
- ; CHECK-NEXT: mov.w r12, #0
1771
+ ; CHECK-NEXT: movs r2, #0
1782
1772
; CHECK-NEXT: b .LBB11_6
1783
1773
; CHECK-NEXT: .LBB11_3:
1784
1774
; CHECK-NEXT: vldr s0, .LCPI11_0
1785
1775
; CHECK-NEXT: b .LBB11_9
1786
1776
; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
1787
- ; CHECK-NEXT: bic r2, r2, #3
1788
- ; CHECK-NEXT: movs r3, #1
1789
- ; CHECK-NEXT: subs r2, #4
1790
- ; CHECK-NEXT: vldr s0, .LCPI11_0
1791
- ; CHECK-NEXT: mov.w r12, #0
1792
- ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
1777
+ ; CHECK-NEXT: sub.w r12, r2, lr
1793
1778
; CHECK-NEXT: adds r3, r1, #4
1794
- ; CHECK-NEXT: dls lr, lr
1795
- ; CHECK-NEXT: adds r2, r0, #4
1779
+ ; CHECK-NEXT: adds r4, r0, #4
1780
+ ; CHECK-NEXT: movs r2, #0
1796
1781
; CHECK-NEXT: .LBB11_5: @ %for.body
1797
1782
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1798
- ; CHECK-NEXT: ldrsh.w r4, [r3, #2]
1799
- ; CHECK-NEXT: vldr.16 s2, [r2, #2]
1800
- ; CHECK-NEXT: add.w r12, r12, #4
1801
- ; CHECK-NEXT: vmov s4, r4
1802
- ; CHECK-NEXT: ldrsh r4, [r3], #8
1783
+ ; CHECK-NEXT: ldrsh.w r5, [r3, #2]
1784
+ ; CHECK-NEXT: vldr.16 s2, [r4, #2]
1785
+ ; CHECK-NEXT: adds r2, #4
1786
+ ; CHECK-NEXT: cmp r12, r2
1787
+ ; CHECK-NEXT: vmov s4, r5
1788
+ ; CHECK-NEXT: ldrsh r5, [r3], #8
1803
1789
; CHECK-NEXT: vcvt.f16.s32 s4, s4
1804
- ; CHECK-NEXT: ldrsh r5 , [r3, #-10]
1790
+ ; CHECK-NEXT: ldrsh r6 , [r3, #-10]
1805
1791
; CHECK-NEXT: vmul.f16 s2, s2, s4
1806
- ; CHECK-NEXT: vmov s6, r4
1807
- ; CHECK-NEXT: vldr.16 s4, [r2 ]
1792
+ ; CHECK-NEXT: vmov s6, r5
1793
+ ; CHECK-NEXT: vldr.16 s4, [r4 ]
1808
1794
; CHECK-NEXT: vcvt.f16.s32 s6, s6
1809
- ; CHECK-NEXT: ldrsh r4 , [r3, #-12]
1795
+ ; CHECK-NEXT: ldrsh r5 , [r3, #-12]
1810
1796
; CHECK-NEXT: vmul.f16 s4, s4, s6
1811
- ; CHECK-NEXT: vmov s8, r5
1812
- ; CHECK-NEXT: vldr.16 s6, [r2 , #-2]
1797
+ ; CHECK-NEXT: vmov s8, r6
1798
+ ; CHECK-NEXT: vldr.16 s6, [r4 , #-2]
1813
1799
; CHECK-NEXT: vcvt.f16.s32 s8, s8
1814
- ; CHECK-NEXT: vmov s10, r4
1800
+ ; CHECK-NEXT: vmov s10, r5
1815
1801
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
1816
1802
; CHECK-NEXT: vmul.f16 s6, s6, s8
1817
- ; CHECK-NEXT: vldr.16 s8, [r2 , #-4]
1803
+ ; CHECK-NEXT: vldr.16 s8, [r4 , #-4]
1818
1804
; CHECK-NEXT: vcvt.f16.s32 s10, s10
1819
1805
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
1820
1806
; CHECK-NEXT: vmul.f16 s8, s8, s10
1821
1807
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
1822
1808
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
1823
- ; CHECK-NEXT: adds r2 , #8
1809
+ ; CHECK-NEXT: add.w r4, r4 , #8
1824
1810
; CHECK-NEXT: vadd.f32 s0, s0, s8
1825
1811
; CHECK-NEXT: vadd.f32 s0, s0, s6
1826
1812
; CHECK-NEXT: vadd.f32 s0, s0, s4
1827
1813
; CHECK-NEXT: vadd.f32 s0, s0, s2
1828
- ; CHECK-NEXT: le lr, .LBB11_5
1814
+ ; CHECK-NEXT: bne .LBB11_5
1829
1815
; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1830
- ; CHECK-NEXT: wls lr, r6 , .LBB11_9
1816
+ ; CHECK-NEXT: wls lr, lr , .LBB11_9
1831
1817
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
1832
- ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
1833
- ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
1834
- ; CHECK-NEXT: mov lr, r6
1818
+ ; CHECK-NEXT: add.w r0, r0, r2, lsl #1
1819
+ ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
1835
1820
; CHECK-NEXT: .LBB11_8: @ %for.body.epil
1836
1821
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1837
1822
; CHECK-NEXT: ldrsh r2, [r1], #2
0 commit comments