@@ -230,29 +230,34 @@ l2:
230
230
ret <4 x i32 > %c
231
231
}
232
232
233
- define <4 x float > @fmul (< 4 x float > %x , ptr %y ) {
233
+ define <4 x float > @fmul (ptr %x , ptr %y ) {
234
234
; CHECK-LABEL: fmul:
235
235
; CHECK: // %bb.0: // %entry
236
- ; CHECK-NEXT: mov v1.16b, v0.16b
237
- ; CHECK-NEXT: ldr q2, [x0]
238
236
; CHECK-NEXT: movi v0.2d, #0000000000000000
239
- ; CHECK-NEXT: mov w8, #1 // =0x1
240
- ; CHECK-NEXT: fmul v1.4s, v2.4s, v1.s[3]
237
+ ; CHECK-NEXT: ldr s1, [x0]
238
+ ; CHECK-NEXT: mov x8, xzr
241
239
; CHECK-NEXT: .LBB7_1: // %l1
242
240
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
243
- ; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
244
- ; CHECK-NEXT: subs w8, w8, #1
241
+ ; CHECK-NEXT: ldr q2, [x1, x8]
242
+ ; CHECK-NEXT: add x8, x8, #16
243
+ ; CHECK-NEXT: cmp w8, #16
244
+ ; CHECK-NEXT: fmul v2.4s, v2.4s, v1.s[0]
245
+ ; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
245
246
; CHECK-NEXT: b.eq .LBB7_1
246
247
; CHECK-NEXT: // %bb.2: // %l2
247
248
; CHECK-NEXT: ret
248
249
entry:
249
- %a = shufflevector <4 x float > %x , <4 x float > undef , <4 x i32 > <i32 3 , i32 3 , i32 3 , i32 3 >
250
+ %x.val = load float , ptr %x
251
+ %x.ins = insertelement <4 x float > poison, float %x.val , i64 0
252
+ %a = shufflevector <4 x float > %x.ins , <4 x float > undef , <4 x i32 > zeroinitializer
250
253
br label %l1
251
254
252
255
l1:
253
256
%p = phi i32 [ 0 , %entry ], [ %pa , %l1 ]
254
257
%q = phi <4 x float > [ zeroinitializer , %entry ], [ %c , %l1 ]
255
- %l = load <4 x float >, ptr %y
258
+ %idx.y = mul nuw nsw i32 %p , 4
259
+ %ptr.y = getelementptr float , ptr %y , i32 %idx.y
260
+ %l = load <4 x float >, ptr %ptr.y
256
261
%b = fmul <4 x float > %l , %a
257
262
%c = fadd <4 x float > %b , %q
258
263
%pa = add i32 %p , 1
@@ -270,10 +275,9 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) {
270
275
; CHECK-NEXT: movi v0.2d, #0000000000000000
271
276
; CHECK-NEXT: ldr q2, [x0]
272
277
; CHECK-NEXT: mov w8, #1 // =0x1
273
- ; CHECK-NEXT: dup v1.4s, v1.s[3]
274
278
; CHECK-NEXT: .LBB8_1: // %l1
275
279
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
276
- ; CHECK-NEXT: fmla v0.4s, v1 .4s, v2.4s
280
+ ; CHECK-NEXT: fmla v0.4s, v2 .4s, v1.s[3]
277
281
; CHECK-NEXT: subs w8, w8, #1
278
282
; CHECK-NEXT: b.eq .LBB8_1
279
283
; CHECK-NEXT: // %bb.2: // %l2
@@ -418,6 +422,134 @@ l2:
418
422
ret <4 x i32 > %r
419
423
}
420
424
425
+ ; We shouldn't sink the splat operand into the loop without fullfp16.
426
+ define <4 x half > @fmul_half (ptr %x , ptr %y ) { ; half-precision splat * vector loop, no +fullfp16 attribute; the CHECK lines expect the splat (ld1r) and its widening (fcvtl) to stay in the entry block and the fmul to be a plain whole-vector .4s multiply
427
+ ; CHECK-LABEL: fmul_half:
428
+ ; CHECK: // %bb.0: // %entry
429
+ ; CHECK-NEXT: ld1r { v1.4h }, [x0]
430
+ ; CHECK-NEXT: movi d0, #0000000000000000
431
+ ; CHECK-NEXT: mov x8, xzr
432
+ ; CHECK-NEXT: fcvtl v1.4s, v1.4h
433
+ ; CHECK-NEXT: .LBB13_1: // %l1
434
+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
435
+ ; CHECK-NEXT: ldr d2, [x1, x8]
436
+ ; CHECK-NEXT: fcvtl v0.4s, v0.4h
437
+ ; CHECK-NEXT: add x8, x8, #8
438
+ ; CHECK-NEXT: cmp w8, #8
439
+ ; CHECK-NEXT: fcvtl v2.4s, v2.4h
440
+ ; CHECK-NEXT: fmul v2.4s, v2.4s, v1.4s
441
+ ; CHECK-NEXT: fcvtn v2.4h, v2.4s
442
+ ; CHECK-NEXT: fcvtl v2.4s, v2.4h
443
+ ; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
444
+ ; CHECK-NEXT: fcvtn v0.4h, v0.4s
445
+ ; CHECK-NEXT: b.eq .LBB13_1
446
+ ; CHECK-NEXT: // %bb.2: // %l2
447
+ ; CHECK-NEXT: ret
448
+ entry:
449
+ %x.val = load half , ptr %x ; scalar that will be broadcast
450
+ %x.ins = insertelement <4 x half > poison, half %x.val , i64 0 ; put the scalar in lane 0
451
+ %a = shufflevector <4 x half > %x.ins , <4 x half > undef , <4 x i32 > zeroinitializer ; all-zero mask copies lane 0 into every lane (splat), computed outside the loop
452
+ br label %l1
453
+
454
+ l1:
455
+ %p = phi i32 [ 0 , %entry ], [ %pa , %l1 ] ; loop counter, starts at 0
456
+ %q = phi <4 x half > [ zeroinitializer , %entry ], [ %c , %l1 ] ; running sum, starts at zero
457
+ %idx.y = mul nuw nsw i32 %p , 4 ; element offset: 4 halves per iteration
458
+ %ptr.y = getelementptr half , ptr %y , i32 %idx.y
459
+ %l = load <4 x half >, ptr %ptr.y
460
+ %b = fmul <4 x half > %l , %a ; the only in-loop user of the loop-invariant splat %a
461
+ %c = fadd <4 x half > %b , %q
462
+ %pa = add i32 %p , 1
463
+ %c1 = icmp eq i32 %p , 0 ; compares the pre-increment counter, so the body runs for %p = 0 and %p = 1 only
464
+ br i1 %c1 , label %l1 , label %l2
465
+
466
+ l2:
467
+ ret <4 x half > %c
468
+ }
469
+
470
+ define <4 x half > @fmul_half_fullfp16 (ptr %x , ptr %y ) "target-features" ="+fullfp16" { ; same IR body as the non-fullfp16 half test but with +fullfp16; the CHECK lines expect a scalar ldr h1 before the loop and a by-element fmul (v1.h[0]) inside it
471
+ ; CHECK-LABEL: fmul_half_fullfp16:
472
+ ; CHECK: // %bb.0: // %entry
473
+ ; CHECK-NEXT: movi d0, #0000000000000000
474
+ ; CHECK-NEXT: ldr h1, [x0]
475
+ ; CHECK-NEXT: mov x8, xzr
476
+ ; CHECK-NEXT: .LBB14_1: // %l1
477
+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
478
+ ; CHECK-NEXT: ldr d2, [x1, x8]
479
+ ; CHECK-NEXT: add x8, x8, #8
480
+ ; CHECK-NEXT: cmp w8, #8
481
+ ; CHECK-NEXT: fmul v2.4h, v2.4h, v1.h[0]
482
+ ; CHECK-NEXT: fadd v0.4h, v2.4h, v0.4h
483
+ ; CHECK-NEXT: b.eq .LBB14_1
484
+ ; CHECK-NEXT: // %bb.2: // %l2
485
+ ; CHECK-NEXT: ret
486
+ entry:
487
+ %x.val = load half , ptr %x ; scalar that will be broadcast
488
+ %x.ins = insertelement <4 x half > poison, half %x.val , i64 0 ; put the scalar in lane 0
489
+ %a = shufflevector <4 x half > %x.ins , <4 x half > undef , <4 x i32 > zeroinitializer ; all-zero mask copies lane 0 into every lane (splat)
490
+ br label %l1
491
+
492
+ l1:
493
+ %p = phi i32 [ 0 , %entry ], [ %pa , %l1 ] ; loop counter, starts at 0
494
+ %q = phi <4 x half > [ zeroinitializer , %entry ], [ %c , %l1 ] ; running sum, starts at zero
495
+ %idx.y = mul nuw nsw i32 %p , 4 ; element offset: 4 halves per iteration
496
+ %ptr.y = getelementptr half , ptr %y , i32 %idx.y
497
+ %l = load <4 x half >, ptr %ptr.y
498
+ %b = fmul <4 x half > %l , %a ; the only in-loop user of the loop-invariant splat %a
499
+ %c = fadd <4 x half > %b , %q
500
+ %pa = add i32 %p , 1
501
+ %c1 = icmp eq i32 %p , 0 ; compares the pre-increment counter, so the body runs for %p = 0 and %p = 1 only
502
+ br i1 %c1 , label %l1 , label %l2
503
+
504
+ l2:
505
+ ret <4 x half > %c
506
+ }
507
+
508
+ ; We shouldn't sink the splat operand for scalable vectors.
509
+ define <vscale x 4 x float > @fmul_scalable (ptr %x , ptr %y ) "target-features" ="+sve" { ; scalable-vector variant of the splat * vector loop; the CHECK lines expect one ld1rw broadcast before the loop and an unindexed fmul z2.s, z2.s, z1.s inside it
510
+ ; CHECK-LABEL: fmul_scalable:
511
+ ; CHECK: // %bb.0: // %entry
512
+ ; CHECK-NEXT: ptrue p0.s
513
+ ; CHECK-NEXT: rdvl x8, #1
514
+ ; CHECK-NEXT: mov z0.s, #0 // =0x0
515
+ ; CHECK-NEXT: sxtw x8, w8
516
+ ; CHECK-NEXT: mov w9, #1 // =0x1
517
+ ; CHECK-NEXT: ld1rw { z1.s }, p0/z, [x0]
518
+ ; CHECK-NEXT: lsl x8, x8, #2
519
+ ; CHECK-NEXT: .LBB15_1: // %l1
520
+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
521
+ ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1]
522
+ ; CHECK-NEXT: subs w9, w9, #1
523
+ ; CHECK-NEXT: add x1, x1, x8
524
+ ; CHECK-NEXT: fmul z2.s, z2.s, z1.s
525
+ ; CHECK-NEXT: fadd z0.s, z2.s, z0.s
526
+ ; CHECK-NEXT: b.eq .LBB15_1
527
+ ; CHECK-NEXT: // %bb.2: // %l2
528
+ ; CHECK-NEXT: ret
529
+ entry:
530
+ %x.val = load float , ptr %x ; scalar that will be broadcast
531
+ %x.ins = insertelement <vscale x 4 x float > poison, float %x.val , i64 0 ; put the scalar in lane 0
532
+ %a = shufflevector <vscale x 4 x float > %x.ins , <vscale x 4 x float > undef , <vscale x 4 x i32 > zeroinitializer ; all-zero mask copies lane 0 into every lane (splat)
533
+ %33 = tail call i32 @llvm.vscale.i32 () ; runtime vscale value
534
+ %34 = shl nuw nsw i32 %33 , 4 ; vscale * 16, used below as the per-iteration element stride
535
+ br label %l1
536
+
537
+ l1:
538
+ %p = phi i32 [ 0 , %entry ], [ %pa , %l1 ] ; loop counter, starts at 0
539
+ %q = phi <vscale x 4 x float > [ zeroinitializer , %entry ], [ %c , %l1 ] ; running sum, starts at zero
540
+ %idx.y = mul nuw nsw i32 %p , %34 ; offset in float elements = %p * vscale * 16; NOTE(review): one <vscale x 4 x float> holds only vscale * 4 floats, so this skips ahead four vectors per step - presumably irrelevant to what the test checks, confirm
541
+ %ptr.y = getelementptr float , ptr %y , i32 %idx.y
542
+ %l = load <vscale x 4 x float >, ptr %ptr.y
543
+ %b = fmul <vscale x 4 x float > %l , %a ; the only in-loop user of the loop-invariant splat %a
544
+ %c = fadd <vscale x 4 x float > %b , %q
545
+ %pa = add i32 %p , 1
546
+ %c1 = icmp eq i32 %p , 0 ; compares the pre-increment counter, so the body runs for %p = 0 and %p = 1 only
547
+ br i1 %c1 , label %l1 , label %l2
548
+
549
+ l2:
550
+ ret <vscale x 4 x float > %c
551
+ }
552
+
421
553
422
554
declare <4 x i32 > @llvm.aarch64.neon.smull.v4i32 (<4 x i16 >, <4 x i16 >)
423
555
declare <4 x i32 > @llvm.aarch64.neon.umull.v4i32 (<4 x i16 >, <4 x i16 >)
0 commit comments