@@ -28,17 +28,16 @@ define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonl
28
28
; CHECK-GI-NEXT: dup v0.4s, w8
29
29
; CHECK-GI-NEXT: mov w8, w0
30
30
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
31
+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
31
32
; CHECK-GI-NEXT: .LBB0_1: // %vector.body
32
33
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
33
34
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
34
35
; CHECK-GI-NEXT: subs x8, x8, #8
35
36
; CHECK-GI-NEXT: ldp d1, d2, [x9]
36
37
; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
37
38
; CHECK-GI-NEXT: add w0, w0, #8
38
- ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
39
- ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
40
- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
41
- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
39
+ ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
40
+ ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
42
41
; CHECK-GI-NEXT: stp q1, q2, [x9]
43
42
; CHECK-GI-NEXT: b.ne .LBB0_1
44
43
; CHECK-GI-NEXT: // %bb.2: // %for.end12
@@ -478,22 +477,21 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
478
477
; CHECK-GI-NEXT: mov x12, x8
479
478
; CHECK-GI-NEXT: .LBB4_3: // %vector.body
480
479
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
481
- ; CHECK-GI-NEXT: ldp q0, q1, [x11, #-16]
482
480
; CHECK-GI-NEXT: and w13, w1, #0xffff
483
- ; CHECK-GI-NEXT: dup v2.4s, w13
481
+ ; CHECK-GI-NEXT: ldp q1, q2, [x11, #-16]
482
+ ; CHECK-GI-NEXT: dup v0.4s, w13
484
483
; CHECK-GI-NEXT: mov x13, x10
485
484
; CHECK-GI-NEXT: subs x12, x12, #16
486
485
; CHECK-GI-NEXT: add x11, x11, #32
487
- ; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0
488
- ; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
489
- ; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0
490
- ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
491
- ; CHECK-GI-NEXT: mul v3.4s, v2.4s, v3.4s
492
- ; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
493
- ; CHECK-GI-NEXT: mul v4.4s, v2.4s, v4.4s
494
- ; CHECK-GI-NEXT: mul v1.4s, v2.4s, v1.4s
495
- ; CHECK-GI-NEXT: stp q3, q0, [x13, #-32]!
496
- ; CHECK-GI-NEXT: stp q4, q1, [x10], #64
486
+ ; CHECK-GI-NEXT: mov d3, v1.d[1]
487
+ ; CHECK-GI-NEXT: mov d4, v2.d[1]
488
+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
489
+ ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
490
+ ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v3.4h
491
+ ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
492
+ ; CHECK-GI-NEXT: umull v0.4s, v0.4h, v4.4h
493
+ ; CHECK-GI-NEXT: stp q1, q3, [x13, #-32]!
494
+ ; CHECK-GI-NEXT: stp q2, q0, [x10], #64
497
495
; CHECK-GI-NEXT: b.ne .LBB4_3
498
496
; CHECK-GI-NEXT: // %bb.4: // %middle.block
499
497
; CHECK-GI-NEXT: cmp x8, x9
@@ -775,22 +773,15 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
775
773
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
776
774
; CHECK-GI-NEXT: mov x8, xzr
777
775
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
778
- ; CHECK-GI-NEXT: mov x9, v0.d[1]
779
- ; CHECK-GI-NEXT: fmov x10, d0
776
+ ; CHECK-GI-NEXT: xtn v0.2s, v0.2d
780
777
; CHECK-GI-NEXT: .LBB6_1: // %loop
781
778
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
782
- ; CHECK-GI-NEXT: ldr d0, [x0]
779
+ ; CHECK-GI-NEXT: ldr d1, [x0]
783
780
; CHECK-GI-NEXT: subs x2, x2, #8
784
781
; CHECK-GI-NEXT: add x8, x8, #8
785
- ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
786
- ; CHECK-GI-NEXT: fmov x11, d0
787
- ; CHECK-GI-NEXT: mov x12, v0.d[1]
788
- ; CHECK-GI-NEXT: mul x11, x11, x10
789
- ; CHECK-GI-NEXT: mul x12, x12, x9
790
- ; CHECK-GI-NEXT: mov v0.d[0], x11
791
- ; CHECK-GI-NEXT: mov v0.d[1], x12
792
- ; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #15
793
- ; CHECK-GI-NEXT: str d0, [x0], #32
782
+ ; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s
783
+ ; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
784
+ ; CHECK-GI-NEXT: str d1, [x0], #32
794
785
; CHECK-GI-NEXT: b.ne .LBB6_1
795
786
; CHECK-GI-NEXT: // %bb.2: // %exit
796
787
; CHECK-GI-NEXT: ret
@@ -917,13 +908,14 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
917
908
; CHECK-GI: // %bb.0: // %entry
918
909
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
919
910
; CHECK-GI-NEXT: mov x8, xzr
911
+ ; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
912
+ ; CHECK-GI-NEXT: xtn v0.8b, v0.8h
920
913
; CHECK-GI-NEXT: .LBB8_1: // %loop
921
914
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
922
915
; CHECK-GI-NEXT: ldr d1, [x0]
923
916
; CHECK-GI-NEXT: subs x2, x2, #8
924
917
; CHECK-GI-NEXT: add x8, x8, #8
925
- ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
926
- ; CHECK-GI-NEXT: mul v1.8h, v1.8h, v0.h[0]
918
+ ; CHECK-GI-NEXT: umull v1.8h, v1.8b, v0.8b
927
919
; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15
928
920
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
929
921
; CHECK-GI-NEXT: str d1, [x0], #32
@@ -1046,17 +1038,16 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
1046
1038
; CHECK-GI-NEXT: dup v0.4s, w8
1047
1039
; CHECK-GI-NEXT: mov w8, w0
1048
1040
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
1041
+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
1049
1042
; CHECK-GI-NEXT: .LBB10_1: // %vector.body
1050
1043
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
1051
1044
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
1052
1045
; CHECK-GI-NEXT: subs x8, x8, #8
1053
1046
; CHECK-GI-NEXT: ldp d1, d2, [x9]
1054
1047
; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
1055
1048
; CHECK-GI-NEXT: add w0, w0, #8
1056
- ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
1057
- ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
1058
- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
1059
- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
1049
+ ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
1050
+ ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
1060
1051
; CHECK-GI-NEXT: stp q1, q2, [x9]
1061
1052
; CHECK-GI-NEXT: b.ne .LBB10_1
1062
1053
; CHECK-GI-NEXT: // %bb.2: // %for.end12
@@ -1135,6 +1126,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
1135
1126
; CHECK-GI-NEXT: dup v0.4s, w8
1136
1127
; CHECK-GI-NEXT: mov w8, w0
1137
1128
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
1129
+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
1138
1130
; CHECK-GI-NEXT: .LBB11_1: // %vector.body
1139
1131
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
1140
1132
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
@@ -1143,16 +1135,14 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
1143
1135
; CHECK-GI-NEXT: ldur q2, [x9, #8]
1144
1136
; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
1145
1137
; CHECK-GI-NEXT: add w0, w0, #16
1146
- ; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0
1147
- ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
1148
- ; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0
1149
- ; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0
1150
- ; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s
1151
- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
1152
- ; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s
1153
- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
1154
- ; CHECK-GI-NEXT: stp q3, q1, [x9]
1155
- ; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
1138
+ ; CHECK-GI-NEXT: mov d3, v1.d[1]
1139
+ ; CHECK-GI-NEXT: mov d4, v2.d[1]
1140
+ ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
1141
+ ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
1142
+ ; CHECK-GI-NEXT: umull v3.4s, v0.4h, v3.4h
1143
+ ; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h
1144
+ ; CHECK-GI-NEXT: stp q1, q3, [x9]
1145
+ ; CHECK-GI-NEXT: stp q2, q4, [x9, #32]!
1156
1146
; CHECK-GI-NEXT: b.ne .LBB11_1
1157
1147
; CHECK-GI-NEXT: // %bb.2: // %for.end12
1158
1148
; CHECK-GI-NEXT: ret
0 commit comments