@@ -284,6 +284,371 @@ define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
  ret i32 %r
}

+
+ define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
+ ; CHECK-LABEL: nested_fadd_f32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
+ ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
+ ; CHECK-NEXT: faddp s1, v1.2s
+ ; CHECK-NEXT: faddp s0, v0.2s
+ ; CHECK-NEXT: fadd s1, s1, s3
+ ; CHECK-NEXT: fadd s0, s0, s2
+ ; CHECK-NEXT: fadd s0, s0, s1
+ ; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
+ %a1 = fadd fast float %r1, %c
+ %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
+ %a2 = fadd fast float %r2, %d
+ %r = fadd fast float %a1, %a2
+ ret float %r
+ }
+
+ define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, float %d) {
+ ; CHECK-LABEL: nested_fadd_f32_slow:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: mov s4, v1.s[2]
+ ; CHECK-NEXT: mov s5, v0.s[2]
+ ; CHECK-NEXT: faddp s6, v0.2s
+ ; CHECK-NEXT: faddp s7, v1.2s
+ ; CHECK-NEXT: mov s1, v1.s[3]
+ ; CHECK-NEXT: mov s0, v0.s[3]
+ ; CHECK-NEXT: fadd s5, s6, s5
+ ; CHECK-NEXT: fadd s4, s7, s4
+ ; CHECK-NEXT: fadd s0, s5, s0
+ ; CHECK-NEXT: fadd s1, s4, s1
+ ; CHECK-NEXT: fadd s0, s0, s2
+ ; CHECK-NEXT: fadd s1, s1, s3
+ ; CHECK-NEXT: fadd s0, s0, s1
+ ; CHECK-NEXT: ret
+ %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
+ %a1 = fadd float %r1, %c
+ %r2 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
+ %a2 = fadd float %r2, %d
+ %r = fadd float %a1, %a2
+ ret float %r
+ }
+
+ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
+ ; CHECK-LABEL: nested_mul_f32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
+ ; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8
+ ; CHECK-NEXT: fmul v1.2s, v1.2s, v4.2s
+ ; CHECK-NEXT: fmul v0.2s, v0.2s, v5.2s
+ ; CHECK-NEXT: fmul s1, s1, v1.s[1]
+ ; CHECK-NEXT: fmul s0, s0, v0.s[1]
+ ; CHECK-NEXT: fmul s1, s1, s3
+ ; CHECK-NEXT: fmul s0, s0, s2
+ ; CHECK-NEXT: fmul s0, s0, s1
+ ; CHECK-NEXT: ret
+ %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
+ %a1 = fmul fast float %r1, %c
+ %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
+ %a2 = fmul fast float %r2, %d
+ %r = fmul fast float %a1, %a2
+ ret float %r
+ }
+
+ define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_add_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: addv s1, v1.4s
+ ; CHECK-NEXT: addv s0, v0.4s
+ ; CHECK-NEXT: fmov w8, s1
+ ; CHECK-NEXT: fmov w9, s0
+ ; CHECK-NEXT: add w9, w9, w0
+ ; CHECK-NEXT: add w8, w8, w1
+ ; CHECK-NEXT: add w0, w9, w8
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+ %a1 = add i32 %r1, %c
+ %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
+ %a2 = add i32 %r2, %d
+ %r = add i32 %a1, %a2
+ ret i32 %r
+ }
+
+ define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_add_c1_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: addv s1, v1.4s
+ ; CHECK-NEXT: addv s0, v0.4s
+ ; CHECK-NEXT: fmov w8, s1
+ ; CHECK-NEXT: fmov w9, s0
+ ; CHECK-NEXT: add w9, w0, w9
+ ; CHECK-NEXT: add w8, w8, w1
+ ; CHECK-NEXT: add w0, w9, w8
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+ %a1 = add i32 %c, %r1
+ %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
+ %a2 = add i32 %r2, %d
+ %r = add i32 %a1, %a2
+ ret i32 %r
+ }
+
+ define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_add_c2_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: addv s1, v1.4s
+ ; CHECK-NEXT: addv s0, v0.4s
+ ; CHECK-NEXT: fmov w8, s1
+ ; CHECK-NEXT: fmov w9, s0
+ ; CHECK-NEXT: add w9, w9, w0
+ ; CHECK-NEXT: add w8, w1, w8
+ ; CHECK-NEXT: add w0, w9, w8
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+ %a1 = add i32 %r1, %c
+ %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
+ %a2 = add i32 %d, %r2
+ %r = add i32 %a1, %a2
+ ret i32 %r
+ }
+
+ define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
+ ; CHECK-LABEL: nested_add_manyreduct_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+ ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+ ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+ ; CHECK-NEXT: addv s0, v0.4s
+ ; CHECK-NEXT: fmov w0, s0
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+ %r3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
+ %a1 = add i32 %r1, %r3
+ %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
+ %r4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
+ %a2 = add i32 %r2, %r4
+ %r = add i32 %a1, %a2
+ ret i32 %r
+ }
+
+ define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_mul_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+ ; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+ ; CHECK-NEXT: mul v0.2s, v0.2s, v3.2s
+ ; CHECK-NEXT: mul v1.2s, v1.2s, v2.2s
+ ; CHECK-NEXT: mov w8, v0.s[1]
+ ; CHECK-NEXT: fmov w10, s0
+ ; CHECK-NEXT: mov w9, v1.s[1]
+ ; CHECK-NEXT: mul w8, w10, w8
+ ; CHECK-NEXT: fmov w10, s1
+ ; CHECK-NEXT: mul w9, w10, w9
+ ; CHECK-NEXT: mul w8, w8, w0
+ ; CHECK-NEXT: mul w9, w9, w1
+ ; CHECK-NEXT: mul w0, w8, w9
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
+ %a1 = mul i32 %r1, %c
+ %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
+ %a2 = mul i32 %r2, %d
+ %r = mul i32 %a1, %a2
+ ret i32 %r
+ }
+
+ define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_and_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+ ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+ ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+ ; CHECK-NEXT: and v0.8b, v0.8b, v3.8b
+ ; CHECK-NEXT: fmov x8, d1
+ ; CHECK-NEXT: fmov x9, d0
+ ; CHECK-NEXT: lsr x10, x9, #32
+ ; CHECK-NEXT: lsr x11, x8, #32
+ ; CHECK-NEXT: and w9, w9, w0
+ ; CHECK-NEXT: and w8, w8, w1
+ ; CHECK-NEXT: and w9, w9, w10
+ ; CHECK-NEXT: and w8, w8, w11
+ ; CHECK-NEXT: and w0, w9, w8
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
+ %a1 = and i32 %r1, %c
+ %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
+ %a2 = and i32 %r2, %d
+ %r = and i32 %a1, %a2
+ ret i32 %r
+ }
+
+ define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_or_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+ ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+ ; CHECK-NEXT: orr v1.8b, v1.8b, v2.8b
+ ; CHECK-NEXT: orr v0.8b, v0.8b, v3.8b
+ ; CHECK-NEXT: fmov x8, d1
+ ; CHECK-NEXT: fmov x9, d0
+ ; CHECK-NEXT: lsr x10, x9, #32
+ ; CHECK-NEXT: lsr x11, x8, #32
+ ; CHECK-NEXT: orr w9, w9, w0
+ ; CHECK-NEXT: orr w8, w8, w1
+ ; CHECK-NEXT: orr w9, w9, w10
+ ; CHECK-NEXT: orr w8, w8, w11
+ ; CHECK-NEXT: orr w0, w9, w8
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
+ %a1 = or i32 %r1, %c
+ %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
+ %a2 = or i32 %r2, %d
+ %r = or i32 %a1, %a2
+ ret i32 %r
+ }
+
+ define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_xor_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+ ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+ ; CHECK-NEXT: eor v1.8b, v1.8b, v2.8b
+ ; CHECK-NEXT: eor v0.8b, v0.8b, v3.8b
+ ; CHECK-NEXT: fmov x8, d1
+ ; CHECK-NEXT: fmov x9, d0
+ ; CHECK-NEXT: lsr x10, x9, #32
+ ; CHECK-NEXT: lsr x11, x8, #32
+ ; CHECK-NEXT: eor w9, w9, w0
+ ; CHECK-NEXT: eor w8, w8, w1
+ ; CHECK-NEXT: eor w9, w9, w10
+ ; CHECK-NEXT: eor w8, w8, w11
+ ; CHECK-NEXT: eor w0, w9, w8
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
+ %a1 = xor i32 %r1, %c
+ %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b)
+ %a2 = xor i32 %r2, %d
+ %r = xor i32 %a1, %a2
+ ret i32 %r
+ }
+
+ define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_smin_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: sminv s0, v0.4s
+ ; CHECK-NEXT: sminv s1, v1.4s
+ ; CHECK-NEXT: fmov w9, s0
+ ; CHECK-NEXT: fmov w8, s1
+ ; CHECK-NEXT: cmp w9, w0
+ ; CHECK-NEXT: csel w9, w9, w0, lt
+ ; CHECK-NEXT: cmp w8, w1
+ ; CHECK-NEXT: csel w8, w8, w1, lt
+ ; CHECK-NEXT: cmp w9, w8
+ ; CHECK-NEXT: csel w0, w9, w8, lt
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
+ %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c)
+ %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b)
+ %a2 = call i32 @llvm.smin.i32(i32 %r2, i32 %d)
+ %r = call i32 @llvm.smin.i32(i32 %a1, i32 %a2)
+ ret i32 %r
+ }
+
+ define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_smax_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: smaxv s0, v0.4s
+ ; CHECK-NEXT: smaxv s1, v1.4s
+ ; CHECK-NEXT: fmov w9, s0
+ ; CHECK-NEXT: fmov w8, s1
+ ; CHECK-NEXT: cmp w9, w0
+ ; CHECK-NEXT: csel w9, w9, w0, gt
+ ; CHECK-NEXT: cmp w8, w1
+ ; CHECK-NEXT: csel w8, w8, w1, gt
+ ; CHECK-NEXT: cmp w9, w8
+ ; CHECK-NEXT: csel w0, w9, w8, gt
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
+ %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c)
+ %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b)
+ %a2 = call i32 @llvm.smax.i32(i32 %r2, i32 %d)
+ %r = call i32 @llvm.smax.i32(i32 %a1, i32 %a2)
+ ret i32 %r
+ }
+
+ define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_umin_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: uminv s0, v0.4s
+ ; CHECK-NEXT: uminv s1, v1.4s
+ ; CHECK-NEXT: fmov w9, s0
+ ; CHECK-NEXT: fmov w8, s1
+ ; CHECK-NEXT: cmp w9, w0
+ ; CHECK-NEXT: csel w9, w9, w0, lo
+ ; CHECK-NEXT: cmp w8, w1
+ ; CHECK-NEXT: csel w8, w8, w1, lo
+ ; CHECK-NEXT: cmp w9, w8
+ ; CHECK-NEXT: csel w0, w9, w8, lo
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
+ %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c)
+ %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b)
+ %a2 = call i32 @llvm.umin.i32(i32 %r2, i32 %d)
+ %r = call i32 @llvm.umin.i32(i32 %a1, i32 %a2)
+ ret i32 %r
+ }
+
+ define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
+ ; CHECK-LABEL: nested_umax_i32:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: umaxv s0, v0.4s
+ ; CHECK-NEXT: umaxv s1, v1.4s
+ ; CHECK-NEXT: fmov w9, s0
+ ; CHECK-NEXT: fmov w8, s1
+ ; CHECK-NEXT: cmp w9, w0
+ ; CHECK-NEXT: csel w9, w9, w0, hi
+ ; CHECK-NEXT: cmp w8, w1
+ ; CHECK-NEXT: csel w8, w8, w1, hi
+ ; CHECK-NEXT: cmp w9, w8
+ ; CHECK-NEXT: csel w0, w9, w8, hi
+ ; CHECK-NEXT: ret
+ %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
+ %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c)
+ %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b)
+ %a2 = call i32 @llvm.umax.i32(i32 %r2, i32 %d)
+ %r = call i32 @llvm.umax.i32(i32 %a1, i32 %a2)
+ ret i32 %r
+ }
+
+ define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
+ ; CHECK-LABEL: nested_fmin_float:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: fminnmv s1, v1.4s
+ ; CHECK-NEXT: fminnmv s0, v0.4s
+ ; CHECK-NEXT: fminnm s1, s1, s3
+ ; CHECK-NEXT: fminnm s0, s0, s2
+ ; CHECK-NEXT: fminnm s0, s0, s1
+ ; CHECK-NEXT: ret
+ %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
+ %a1 = call float @llvm.minnum.f32(float %r1, float %c)
+ %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
+ %a2 = call float @llvm.minnum.f32(float %r2, float %d)
+ %r = call float @llvm.minnum.f32(float %a1, float %a2)
+ ret float %r
+ }
+
+ define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
+ ; CHECK-LABEL: nested_fmax_float:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: fmaxnmv s1, v1.4s
+ ; CHECK-NEXT: fmaxnmv s0, v0.4s
+ ; CHECK-NEXT: fmaxnm s1, s1, s3
+ ; CHECK-NEXT: fmaxnm s0, s0, s2
+ ; CHECK-NEXT: fmaxnm s0, s0, s1
+ ; CHECK-NEXT: ret
+ %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
+ %a1 = call float @llvm.maxnum.f32(float %r1, float %c)
+ %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
+ %a2 = call float @llvm.maxnum.f32(float %r2, float %d)
+ %r = call float @llvm.maxnum.f32(float %a1, float %a2)
+ ret float %r
+ }
+
+

declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
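The new tests also call reduction and scalar min/max intrinsics whose declarations fall outside this hunk. As a reference sketch only, assuming the rest of the file extends the declaration block above, the standard signatures per the LLVM LangRef would be the following; note the file itself may spell some of these with older mangled names (e.g. the fadd.f32.v4f32 form seen above), and the CHECK lines follow the llvm/utils/update_llc_test_checks.py format, so they can be regenerated mechanically:

; Assumed declarations, not part of this hunk (standard LangRef signatures).
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)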