@@ -606,7 +606,7 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 {
606
606
;
607
607
; VBITS_GE_256-LABEL: srem_v16i32:
608
608
; VBITS_GE_256: // %bb.0:
609
- ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
609
+ ; VBITS_GE_256-NEXT: mov x8, #8
610
610
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
611
611
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
612
612
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -680,13 +680,13 @@ define void @srem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
680
680
define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
681
681
; CHECK-LABEL: srem_v1i64:
682
682
; CHECK: // %bb.0:
683
- ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
684
- ; CHECK-NEXT: ptrue p0.d, vl1
685
683
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
684
+ ; CHECK-NEXT: ptrue p0.d, vl1
685
+ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
686
686
; CHECK-NEXT: movprfx z2, z0
687
687
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
688
- ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
689
- ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
688
+ ; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
689
+ ; CHECK-NEXT: sub d0, d0, d1
690
690
; CHECK-NEXT: ret
691
691
%res = srem <1 x i64> %op1, %op2
692
692
ret <1 x i64> %res
@@ -697,13 +697,13 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
697
697
define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
698
698
; CHECK-LABEL: srem_v2i64:
699
699
; CHECK: // %bb.0:
700
- ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
701
- ; CHECK-NEXT: ptrue p0.d, vl2
702
700
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
701
+ ; CHECK-NEXT: ptrue p0.d, vl2
702
+ ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
703
703
; CHECK-NEXT: movprfx z2, z0
704
704
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
705
- ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
706
- ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
705
+ ; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
706
+ ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
707
707
; CHECK-NEXT: ret
708
708
%res = srem <2 x i64> %op1, %op2
709
709
ret <2 x i64> %res
@@ -730,32 +730,34 @@ define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
730
730
define void @srem_v8i64(ptr %a, ptr %b) #0 {
731
731
; VBITS_GE_128-LABEL: srem_v8i64:
732
732
; VBITS_GE_128: // %bb.0:
733
- ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
733
+ ; VBITS_GE_128-NEXT: ldp q4, q5, [x1]
734
734
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
735
- ; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32]
735
+ ; VBITS_GE_128-NEXT: ldp q7, q6, [x1, #32]
736
+ ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
737
+ ; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
738
+ ; VBITS_GE_128-NEXT: movprfx z16, z3
739
+ ; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z5.d
740
+ ; VBITS_GE_128-NEXT: movprfx z17, z2
741
+ ; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z4.d
742
+ ; VBITS_GE_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
736
743
; VBITS_GE_128-NEXT: movprfx z16, z1
737
- ; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z3.d
738
- ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d
739
- ; VBITS_GE_128-NEXT: movprfx z3, z0
740
- ; VBITS_GE_128-NEXT: sdiv z3.d, p0/m, z3.d, z2.d
741
- ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d
742
- ; VBITS_GE_128-NEXT: ldp q4, q5, [x0]
743
- ; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
744
- ; VBITS_GE_128-NEXT: movprfx z16, z5
745
744
; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z6.d
746
- ; VBITS_GE_128-NEXT: movprfx z2, z4
747
- ; VBITS_GE_128-NEXT: sdiv z2.d, p0/m, z2.d, z7.d
745
+ ; VBITS_GE_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
746
+ ; VBITS_GE_128-NEXT: movprfx z17, z0
747
+ ; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z7.d
748
+ ; VBITS_GE_128-NEXT: mul z6.d, p0/m, z6.d, z16.d
749
+ ; VBITS_GE_128-NEXT: mul z7.d, p0/m, z7.d, z17.d
750
+ ; VBITS_GE_128-NEXT: sub v0.2d, v0.2d, v7.2d
751
+ ; VBITS_GE_128-NEXT: sub v1.2d, v1.2d, v6.2d
752
+ ; VBITS_GE_128-NEXT: sub v2.2d, v2.2d, v4.2d
748
753
; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
749
- ; VBITS_GE_128-NEXT: movprfx z0, z4
750
- ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d
751
- ; VBITS_GE_128-NEXT: movprfx z1, z5
752
- ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d
753
- ; VBITS_GE_128-NEXT: stp q0, q1, [x0]
754
+ ; VBITS_GE_128-NEXT: sub v0.2d, v3.2d, v5.2d
755
+ ; VBITS_GE_128-NEXT: stp q2, q0, [x0]
754
756
; VBITS_GE_128-NEXT: ret
755
757
;
756
758
; VBITS_GE_256-LABEL: srem_v8i64:
757
759
; VBITS_GE_256: // %bb.0:
758
- ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
760
+ ; VBITS_GE_256-NEXT: mov x8, #4
759
761
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
760
762
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
761
763
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1424,7 +1426,7 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 {
1424
1426
;
1425
1427
; VBITS_GE_256-LABEL: urem_v16i32:
1426
1428
; VBITS_GE_256: // %bb.0:
1427
- ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1429
+ ; VBITS_GE_256-NEXT: mov x8, #8
1428
1430
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1429
1431
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1430
1432
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1498,13 +1500,13 @@ define void @urem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1498
1500
define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
1499
1501
; CHECK-LABEL: urem_v1i64:
1500
1502
; CHECK: // %bb.0:
1501
- ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
1502
- ; CHECK-NEXT: ptrue p0.d, vl1
1503
1503
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
1504
+ ; CHECK-NEXT: ptrue p0.d, vl1
1505
+ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
1504
1506
; CHECK-NEXT: movprfx z2, z0
1505
1507
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1506
- ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
1507
- ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1508
+ ; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
1509
+ ; CHECK-NEXT: sub d0, d0, d1
1508
1510
; CHECK-NEXT: ret
1509
1511
%res = urem <1 x i64> %op1, %op2
1510
1512
ret <1 x i64> %res
@@ -1515,13 +1517,13 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
1515
1517
define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
1516
1518
; CHECK-LABEL: urem_v2i64:
1517
1519
; CHECK: // %bb.0:
1518
- ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
1519
- ; CHECK-NEXT: ptrue p0.d, vl2
1520
1520
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
1521
+ ; CHECK-NEXT: ptrue p0.d, vl2
1522
+ ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
1521
1523
; CHECK-NEXT: movprfx z2, z0
1522
1524
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1523
- ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
1524
- ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1525
+ ; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
1526
+ ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
1525
1527
; CHECK-NEXT: ret
1526
1528
%res = urem <2 x i64> %op1, %op2
1527
1529
ret <2 x i64> %res
@@ -1548,32 +1550,34 @@ define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1548
1550
define void @urem_v8i64(ptr %a, ptr %b) #0 {
1549
1551
; VBITS_GE_128-LABEL: urem_v8i64:
1550
1552
; VBITS_GE_128: // %bb.0:
1551
- ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
1553
+ ; VBITS_GE_128-NEXT: ldp q4, q5, [x1]
1552
1554
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
1553
- ; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32]
1555
+ ; VBITS_GE_128-NEXT: ldp q7, q6, [x1, #32]
1556
+ ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
1557
+ ; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
1558
+ ; VBITS_GE_128-NEXT: movprfx z16, z3
1559
+ ; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z5.d
1560
+ ; VBITS_GE_128-NEXT: movprfx z17, z2
1561
+ ; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z4.d
1562
+ ; VBITS_GE_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
1554
1563
; VBITS_GE_128-NEXT: movprfx z16, z1
1555
- ; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z3.d
1556
- ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d
1557
- ; VBITS_GE_128-NEXT: movprfx z3, z0
1558
- ; VBITS_GE_128-NEXT: udiv z3.d, p0/m, z3.d, z2.d
1559
- ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d
1560
- ; VBITS_GE_128-NEXT: ldp q4, q5, [x0]
1561
- ; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
1562
- ; VBITS_GE_128-NEXT: movprfx z16, z5
1563
1564
; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z6.d
1564
- ; VBITS_GE_128-NEXT: movprfx z2, z4
1565
- ; VBITS_GE_128-NEXT: udiv z2.d, p0/m, z2.d, z7.d
1565
+ ; VBITS_GE_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
1566
+ ; VBITS_GE_128-NEXT: movprfx z17, z0
1567
+ ; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z7.d
1568
+ ; VBITS_GE_128-NEXT: mul z6.d, p0/m, z6.d, z16.d
1569
+ ; VBITS_GE_128-NEXT: mul z7.d, p0/m, z7.d, z17.d
1570
+ ; VBITS_GE_128-NEXT: sub v0.2d, v0.2d, v7.2d
1571
+ ; VBITS_GE_128-NEXT: sub v1.2d, v1.2d, v6.2d
1572
+ ; VBITS_GE_128-NEXT: sub v2.2d, v2.2d, v4.2d
1566
1573
; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
1567
- ; VBITS_GE_128-NEXT: movprfx z0, z4
1568
- ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d
1569
- ; VBITS_GE_128-NEXT: movprfx z1, z5
1570
- ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d
1571
- ; VBITS_GE_128-NEXT: stp q0, q1, [x0]
1574
+ ; VBITS_GE_128-NEXT: sub v0.2d, v3.2d, v5.2d
1575
+ ; VBITS_GE_128-NEXT: stp q2, q0, [x0]
1572
1576
; VBITS_GE_128-NEXT: ret
1573
1577
;
1574
1578
; VBITS_GE_256-LABEL: urem_v8i64:
1575
1579
; VBITS_GE_256: // %bb.0:
1576
- ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1580
+ ; VBITS_GE_256-NEXT: mov x8, #4
1577
1581
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1578
1582
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1579
1583
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
0 commit comments