@@ -108,22 +108,21 @@ define void @matrix_mul_signed(i32 %N, ptr nocapture %C, ptr nocapture readonly
108
108
;
109
109
; CHECK-GI-LABEL: matrix_mul_signed:
110
110
; CHECK-GI: // %bb.0: // %vector.header
111
- ; CHECK-GI-NEXT: sxth w9 , w3
111
+ ; CHECK-GI-NEXT: sxth w8 , w3
112
112
; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
113
+ ; CHECK-GI-NEXT: dup v0.4s, w8
113
114
; CHECK-GI-NEXT: sxtw x8, w0
114
- ; CHECK-GI-NEXT: dup v0.4s, w9
115
115
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
116
+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
116
117
; CHECK-GI-NEXT: .LBB1_1: // %vector.body
117
118
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
118
119
; CHECK-GI-NEXT: add x9, x2, w0, sxtw #1
119
120
; CHECK-GI-NEXT: subs x8, x8, #8
120
121
; CHECK-GI-NEXT: ldp d1, d2, [x9]
121
122
; CHECK-GI-NEXT: add x9, x1, w0, sxtw #2
122
123
; CHECK-GI-NEXT: add w0, w0, #8
123
- ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
124
- ; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
125
- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
126
- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
124
+ ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
125
+ ; CHECK-GI-NEXT: smull v2.4s, v0.4h, v2.4h
127
126
; CHECK-GI-NEXT: stp q1, q2, [x9]
128
127
; CHECK-GI-NEXT: b.ne .LBB1_1
129
128
; CHECK-GI-NEXT: // %bb.2: // %for.end12
@@ -305,40 +304,39 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
305
304
; CHECK-GI-NEXT: b.le .LBB3_7
306
305
; CHECK-GI-NEXT: // %bb.1: // %for.body.preheader
307
306
; CHECK-GI-NEXT: sxth w8, w1
308
- ; CHECK-GI-NEXT: mov x9 , xzr
307
+ ; CHECK-GI-NEXT: mov x10 , xzr
309
308
; CHECK-GI-NEXT: cmp w3, #16
310
- ; CHECK-GI-NEXT: mov w10 , w3
309
+ ; CHECK-GI-NEXT: mov w9 , w3
311
310
; CHECK-GI-NEXT: b.lo .LBB3_5
312
311
; CHECK-GI-NEXT: // %bb.2: // %vector.ph
313
312
; CHECK-GI-NEXT: dup v0.4s, w8
314
- ; CHECK-GI-NEXT: and x9, x10 , #0xfffffff0
313
+ ; CHECK-GI-NEXT: and x10, x9 , #0xfffffff0
315
314
; CHECK-GI-NEXT: add x11, x2, #32
316
315
; CHECK-GI-NEXT: add x12, x0, #16
317
- ; CHECK-GI-NEXT: mov x13, x9
316
+ ; CHECK-GI-NEXT: mov x13, x10
317
+ ; CHECK-GI-NEXT: xtn v0.4h, v0.4s
318
318
; CHECK-GI-NEXT: .LBB3_3: // %vector.body
319
319
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
320
320
; CHECK-GI-NEXT: ldp q1, q2, [x12, #-16]
321
321
; CHECK-GI-NEXT: mov x14, x11
322
322
; CHECK-GI-NEXT: subs x13, x13, #16
323
323
; CHECK-GI-NEXT: add x12, x12, #32
324
- ; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0
325
- ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
326
- ; CHECK-GI-NEXT: sshll v4.4s, v2.4h, #0
327
- ; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0
328
- ; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s
329
- ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
330
- ; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s
331
- ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
332
- ; CHECK-GI-NEXT: stp q3, q1, [x14, #-32]!
333
- ; CHECK-GI-NEXT: stp q4, q2, [x11], #64
324
+ ; CHECK-GI-NEXT: mov d3, v1.d[1]
325
+ ; CHECK-GI-NEXT: mov d4, v2.d[1]
326
+ ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
327
+ ; CHECK-GI-NEXT: smull v2.4s, v0.4h, v2.4h
328
+ ; CHECK-GI-NEXT: smull v3.4s, v0.4h, v3.4h
329
+ ; CHECK-GI-NEXT: smull v4.4s, v0.4h, v4.4h
330
+ ; CHECK-GI-NEXT: stp q1, q3, [x14, #-32]!
331
+ ; CHECK-GI-NEXT: stp q2, q4, [x11], #64
334
332
; CHECK-GI-NEXT: b.ne .LBB3_3
335
333
; CHECK-GI-NEXT: // %bb.4: // %middle.block
336
- ; CHECK-GI-NEXT: cmp x9, x10
334
+ ; CHECK-GI-NEXT: cmp x10, x9
337
335
; CHECK-GI-NEXT: b.eq .LBB3_7
338
336
; CHECK-GI-NEXT: .LBB3_5: // %for.body.preheader1
339
- ; CHECK-GI-NEXT: add x11, x2, x9 , lsl #2
340
- ; CHECK-GI-NEXT: add x12, x0, x9 , lsl #1
341
- ; CHECK-GI-NEXT: sub x9, x10, x9
337
+ ; CHECK-GI-NEXT: add x11, x2, x10 , lsl #2
338
+ ; CHECK-GI-NEXT: add x12, x0, x10 , lsl #1
339
+ ; CHECK-GI-NEXT: sub x9, x9, x10
342
340
; CHECK-GI-NEXT: .LBB3_6: // %for.body
343
341
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
344
342
; CHECK-GI-NEXT: ldrsh w10, [x12], #2
@@ -834,30 +832,18 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
834
832
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
835
833
; CHECK-GI-NEXT: mov x8, xzr
836
834
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
837
- ; CHECK-GI-NEXT: mov x9, v0.d[1]
838
- ; CHECK-GI-NEXT: fmov x10, d0
835
+ ; CHECK-GI-NEXT: xtn v0.2s, v0.2d
839
836
; CHECK-GI-NEXT: .LBB7_1: // %loop
840
837
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
841
- ; CHECK-GI-NEXT: ldr q0 , [x0]
838
+ ; CHECK-GI-NEXT: ldr q1 , [x0]
842
839
; CHECK-GI-NEXT: subs x2, x2, #8
843
840
; CHECK-GI-NEXT: add x8, x8, #8
844
- ; CHECK-GI-NEXT: sshll v1.2d, v0.2s, #0
845
- ; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0
846
- ; CHECK-GI-NEXT: fmov x11, d1
847
- ; CHECK-GI-NEXT: mov x12, v1.d[1]
848
- ; CHECK-GI-NEXT: fmov x13, d0
849
- ; CHECK-GI-NEXT: mov x14, v0.d[1]
850
- ; CHECK-GI-NEXT: mul x11, x11, x10
851
- ; CHECK-GI-NEXT: mul x13, x13, x10
852
- ; CHECK-GI-NEXT: mul x12, x12, x9
853
- ; CHECK-GI-NEXT: mov v0.d[0], x11
854
- ; CHECK-GI-NEXT: mul x11, x14, x9
855
- ; CHECK-GI-NEXT: mov v1.d[0], x13
856
- ; CHECK-GI-NEXT: mov v0.d[1], x12
857
- ; CHECK-GI-NEXT: mov v1.d[1], x11
858
- ; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #15
859
- ; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #15
860
- ; CHECK-GI-NEXT: str q0, [x0], #32
841
+ ; CHECK-GI-NEXT: mov d2, v1.d[1]
842
+ ; CHECK-GI-NEXT: smull v1.2d, v1.2s, v0.2s
843
+ ; CHECK-GI-NEXT: smull v2.2d, v2.2s, v0.2s
844
+ ; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
845
+ ; CHECK-GI-NEXT: shrn2 v1.4s, v2.2d, #15
846
+ ; CHECK-GI-NEXT: str q1, [x0], #32
861
847
; CHECK-GI-NEXT: b.ne .LBB7_1
862
848
; CHECK-GI-NEXT: // %bb.2: // %exit
863
849
; CHECK-GI-NEXT: ret
@@ -971,18 +957,19 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
971
957
; CHECK-GI: // %bb.0: // %entry
972
958
; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
973
959
; CHECK-GI-NEXT: mov x8, xzr
960
+ ; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
961
+ ; CHECK-GI-NEXT: xtn v0.8b, v0.8h
974
962
; CHECK-GI-NEXT: .LBB9_1: // %loop
975
963
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
976
964
; CHECK-GI-NEXT: ldr q1, [x0]
977
965
; CHECK-GI-NEXT: subs x2, x2, #8
978
966
; CHECK-GI-NEXT: add x8, x8, #8
979
- ; CHECK-GI-NEXT: sshll v2.8h, v1.8b, #0
980
- ; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
981
- ; CHECK-GI-NEXT: mul v2.8h, v2.8h, v0.h[2]
982
- ; CHECK-GI-NEXT: mul v1.8h, v1.8h, v0.h[2]
983
- ; CHECK-GI-NEXT: sshr v2.8h, v2.8h, #15
967
+ ; CHECK-GI-NEXT: mov d2, v1.d[1]
968
+ ; CHECK-GI-NEXT: smull v1.8h, v1.8b, v0.8b
969
+ ; CHECK-GI-NEXT: smull v2.8h, v2.8b, v0.8b
984
970
; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15
985
- ; CHECK-GI-NEXT: uzp1 v1.16b, v2.16b, v1.16b
971
+ ; CHECK-GI-NEXT: sshr v2.8h, v2.8h, #15
972
+ ; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v2.16b
986
973
; CHECK-GI-NEXT: str q1, [x0], #32
987
974
; CHECK-GI-NEXT: b.ne .LBB9_1
988
975
; CHECK-GI-NEXT: // %bb.2: // %exit
0 commit comments