@@ -359,44 +359,46 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
359
359
; RV32-NEXT: feq.d a0, fa3, fa3
360
360
; RV32-NEXT: fmax.d fa3, fa3, fa5
361
361
; RV32-NEXT: fmin.d fa3, fa3, fa4
362
+ ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
363
+ ; RV32-NEXT: fld fa2, 40(sp)
362
364
; RV32-NEXT: fcvt.w.d a2, fa3, rtz
363
- ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
364
- ; RV32-NEXT: fld fa3, 32(sp)
365
365
; RV32-NEXT: neg a0, a0
366
366
; RV32-NEXT: and a0, a0, a2
367
- ; RV32-NEXT: vslide1down.vx v8, v10, a0
368
- ; RV32-NEXT: feq.d a0, fa3, fa3
369
- ; RV32-NEXT: fmax.d fa3, fa3, fa5
367
+ ; RV32-NEXT: feq.d a2, fa2, fa2
368
+ ; RV32-NEXT: fmax.d fa3, fa2, fa5
370
369
; RV32-NEXT: fmin.d fa3, fa3, fa4
371
- ; RV32-NEXT: fcvt.w.d a2, fa3, rtz
372
- ; RV32-NEXT: fld fa3, 40(sp)
373
- ; RV32-NEXT: neg a0, a0
374
- ; RV32-NEXT: and a0, a0, a2
375
- ; RV32-NEXT: vslide1down.vx v8, v8, a0
376
- ; RV32-NEXT: feq.d a0, fa3, fa3
370
+ ; RV32-NEXT: fcvt.w.d a3, fa3, rtz
371
+ ; RV32-NEXT: fld fa3, 32(sp)
372
+ ; RV32-NEXT: vslide1down.vx v8, v10, a0
373
+ ; RV32-NEXT: neg a0, a2
374
+ ; RV32-NEXT: and a0, a0, a3
375
+ ; RV32-NEXT: feq.d a2, fa3, fa3
376
+ ; RV32-NEXT: neg a2, a2
377
377
; RV32-NEXT: fmax.d fa3, fa3, fa5
378
378
; RV32-NEXT: fmin.d fa3, fa3, fa4
379
- ; RV32-NEXT: fcvt.w.d a2 , fa3, rtz
379
+ ; RV32-NEXT: fcvt.w.d a3 , fa3, rtz
380
380
; RV32-NEXT: fld fa3, 48(sp)
381
- ; RV32-NEXT: neg a0, a0
382
- ; RV32-NEXT: and a0, a0 , a2
383
- ; RV32-NEXT: vslide1down.vx v8, v8 , a0
381
+ ; RV32-NEXT: and a2, a2, a3
382
+ ; RV32-NEXT: vmv.v.x v9 , a2
383
+ ; RV32-NEXT: vslide1down.vx v9, v9 , a0
384
384
; RV32-NEXT: feq.d a0, fa3, fa3
385
385
; RV32-NEXT: fmax.d fa3, fa3, fa5
386
386
; RV32-NEXT: fmin.d fa3, fa3, fa4
387
387
; RV32-NEXT: fcvt.w.d a2, fa3, rtz
388
388
; RV32-NEXT: fld fa3, 56(sp)
389
389
; RV32-NEXT: neg a0, a0
390
390
; RV32-NEXT: and a0, a0, a2
391
- ; RV32-NEXT: vslide1down.vx v8, v8 , a0
391
+ ; RV32-NEXT: vslide1down.vx v9, v9 , a0
392
392
; RV32-NEXT: feq.d a0, fa3, fa3
393
393
; RV32-NEXT: neg a0, a0
394
394
; RV32-NEXT: fmax.d fa5, fa3, fa5
395
395
; RV32-NEXT: fmin.d fa5, fa5, fa4
396
396
; RV32-NEXT: fcvt.w.d a2, fa5, rtz
397
397
; RV32-NEXT: and a0, a0, a2
398
- ; RV32-NEXT: vslide1down.vx v8, v8, a0
399
- ; RV32-NEXT: vse8.v v8, (a1)
398
+ ; RV32-NEXT: vmv.v.i v0, 15
399
+ ; RV32-NEXT: vslide1down.vx v9, v9, a0
400
+ ; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
401
+ ; RV32-NEXT: vse8.v v9, (a1)
400
402
; RV32-NEXT: addi sp, s0, -128
401
403
; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
402
404
; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
@@ -458,44 +460,46 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
458
460
; RV64-NEXT: feq.d a0, fa3, fa3
459
461
; RV64-NEXT: fmax.d fa3, fa3, fa5
460
462
; RV64-NEXT: fmin.d fa3, fa3, fa4
463
+ ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
464
+ ; RV64-NEXT: fld fa2, 40(sp)
461
465
; RV64-NEXT: fcvt.l.d a2, fa3, rtz
462
- ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
463
- ; RV64-NEXT: fld fa3, 32(sp)
464
466
; RV64-NEXT: neg a0, a0
465
467
; RV64-NEXT: and a0, a0, a2
466
- ; RV64-NEXT: vslide1down.vx v8, v10, a0
467
- ; RV64-NEXT: feq.d a0, fa3, fa3
468
- ; RV64-NEXT: fmax.d fa3, fa3, fa5
468
+ ; RV64-NEXT: feq.d a2, fa2, fa2
469
+ ; RV64-NEXT: fmax.d fa3, fa2, fa5
469
470
; RV64-NEXT: fmin.d fa3, fa3, fa4
470
- ; RV64-NEXT: fcvt.l.d a2, fa3, rtz
471
- ; RV64-NEXT: fld fa3, 40(sp)
472
- ; RV64-NEXT: neg a0, a0
473
- ; RV64-NEXT: and a0, a0, a2
474
- ; RV64-NEXT: vslide1down.vx v8, v8, a0
475
- ; RV64-NEXT: feq.d a0, fa3, fa3
471
+ ; RV64-NEXT: fcvt.l.d a3, fa3, rtz
472
+ ; RV64-NEXT: fld fa3, 32(sp)
473
+ ; RV64-NEXT: vslide1down.vx v8, v10, a0
474
+ ; RV64-NEXT: neg a0, a2
475
+ ; RV64-NEXT: and a0, a0, a3
476
+ ; RV64-NEXT: feq.d a2, fa3, fa3
477
+ ; RV64-NEXT: negw a2, a2
476
478
; RV64-NEXT: fmax.d fa3, fa3, fa5
477
479
; RV64-NEXT: fmin.d fa3, fa3, fa4
478
- ; RV64-NEXT: fcvt.l.d a2 , fa3, rtz
480
+ ; RV64-NEXT: fcvt.l.d a3 , fa3, rtz
479
481
; RV64-NEXT: fld fa3, 48(sp)
480
- ; RV64-NEXT: neg a0, a0
481
- ; RV64-NEXT: and a0, a0 , a2
482
- ; RV64-NEXT: vslide1down.vx v8, v8 , a0
482
+ ; RV64-NEXT: and a2, a2, a3
483
+ ; RV64-NEXT: vmv.v.x v9 , a2
484
+ ; RV64-NEXT: vslide1down.vx v9, v9 , a0
483
485
; RV64-NEXT: feq.d a0, fa3, fa3
484
486
; RV64-NEXT: fmax.d fa3, fa3, fa5
485
487
; RV64-NEXT: fmin.d fa3, fa3, fa4
486
488
; RV64-NEXT: fcvt.l.d a2, fa3, rtz
487
489
; RV64-NEXT: fld fa3, 56(sp)
488
490
; RV64-NEXT: neg a0, a0
489
491
; RV64-NEXT: and a0, a0, a2
490
- ; RV64-NEXT: vslide1down.vx v8, v8 , a0
492
+ ; RV64-NEXT: vslide1down.vx v9, v9 , a0
491
493
; RV64-NEXT: feq.d a0, fa3, fa3
492
494
; RV64-NEXT: neg a0, a0
493
495
; RV64-NEXT: fmax.d fa5, fa3, fa5
494
496
; RV64-NEXT: fmin.d fa5, fa5, fa4
495
497
; RV64-NEXT: fcvt.l.d a2, fa5, rtz
496
498
; RV64-NEXT: and a0, a0, a2
497
- ; RV64-NEXT: vslide1down.vx v8, v8, a0
498
- ; RV64-NEXT: vse8.v v8, (a1)
499
+ ; RV64-NEXT: vmv.v.i v0, 15
500
+ ; RV64-NEXT: vslide1down.vx v9, v9, a0
501
+ ; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
502
+ ; RV64-NEXT: vse8.v v9, (a1)
499
503
; RV64-NEXT: addi sp, s0, -128
500
504
; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
501
505
; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
@@ -553,11 +557,11 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
553
557
; RV32-NEXT: vslidedown.vi v8, v8, 3
554
558
; RV32-NEXT: vfmv.f.s fa4, v8
555
559
; RV32-NEXT: fmax.d fa4, fa4, fa3
556
- ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
557
- ; RV32-NEXT: fld fa2, 32 (sp)
560
+ ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
561
+ ; RV32-NEXT: fld fa2, 40 (sp)
558
562
; RV32-NEXT: fmin.d fa4, fa4, fa5
559
563
; RV32-NEXT: fcvt.wu.d a0, fa4, rtz
560
- ; RV32-NEXT: fld fa4, 40 (sp)
564
+ ; RV32-NEXT: fld fa4, 32 (sp)
561
565
; RV32-NEXT: fmax.d fa2, fa2, fa3
562
566
; RV32-NEXT: fmin.d fa2, fa2, fa5
563
567
; RV32-NEXT: fcvt.wu.d a2, fa2, rtz
@@ -570,14 +574,16 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
570
574
; RV32-NEXT: fmin.d fa4, fa4, fa5
571
575
; RV32-NEXT: fcvt.wu.d a0, fa4, rtz
572
576
; RV32-NEXT: fld fa4, 56(sp)
573
- ; RV32-NEXT: vslide1down.vx v8, v8, a2
574
- ; RV32-NEXT: vslide1down.vx v8, v8, a3
575
- ; RV32-NEXT: vslide1down.vx v8, v8 , a0
577
+ ; RV32-NEXT: vmv.v.x v9, a3
578
+ ; RV32-NEXT: vslide1down.vx v9, v9, a2
579
+ ; RV32-NEXT: vslide1down.vx v9, v9 , a0
576
580
; RV32-NEXT: fmax.d fa4, fa4, fa3
577
581
; RV32-NEXT: fmin.d fa5, fa4, fa5
578
582
; RV32-NEXT: fcvt.wu.d a0, fa5, rtz
579
- ; RV32-NEXT: vslide1down.vx v8, v8, a0
580
- ; RV32-NEXT: vse8.v v8, (a1)
583
+ ; RV32-NEXT: vmv.v.i v0, 15
584
+ ; RV32-NEXT: vslide1down.vx v9, v9, a0
585
+ ; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
586
+ ; RV32-NEXT: vse8.v v9, (a1)
581
587
; RV32-NEXT: addi sp, s0, -128
582
588
; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
583
589
; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
@@ -627,11 +633,11 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
627
633
; RV64-NEXT: vslidedown.vi v8, v8, 3
628
634
; RV64-NEXT: vfmv.f.s fa4, v8
629
635
; RV64-NEXT: fmax.d fa4, fa4, fa3
630
- ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
631
- ; RV64-NEXT: fld fa2, 32 (sp)
636
+ ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
637
+ ; RV64-NEXT: fld fa2, 40 (sp)
632
638
; RV64-NEXT: fmin.d fa4, fa4, fa5
633
639
; RV64-NEXT: fcvt.lu.d a0, fa4, rtz
634
- ; RV64-NEXT: fld fa4, 40 (sp)
640
+ ; RV64-NEXT: fld fa4, 32 (sp)
635
641
; RV64-NEXT: fmax.d fa2, fa2, fa3
636
642
; RV64-NEXT: fmin.d fa2, fa2, fa5
637
643
; RV64-NEXT: fcvt.lu.d a2, fa2, rtz
@@ -644,14 +650,16 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
644
650
; RV64-NEXT: fmin.d fa4, fa4, fa5
645
651
; RV64-NEXT: fcvt.lu.d a0, fa4, rtz
646
652
; RV64-NEXT: fld fa4, 56(sp)
647
- ; RV64-NEXT: vslide1down.vx v8, v8, a2
648
- ; RV64-NEXT: vslide1down.vx v8, v8, a3
649
- ; RV64-NEXT: vslide1down.vx v8, v8 , a0
653
+ ; RV64-NEXT: vmv.v.x v9, a3
654
+ ; RV64-NEXT: vslide1down.vx v9, v9, a2
655
+ ; RV64-NEXT: vslide1down.vx v9, v9 , a0
650
656
; RV64-NEXT: fmax.d fa4, fa4, fa3
651
657
; RV64-NEXT: fmin.d fa5, fa4, fa5
652
658
; RV64-NEXT: fcvt.lu.d a0, fa5, rtz
653
- ; RV64-NEXT: vslide1down.vx v8, v8, a0
654
- ; RV64-NEXT: vse8.v v8, (a1)
659
+ ; RV64-NEXT: vmv.v.i v0, 15
660
+ ; RV64-NEXT: vslide1down.vx v9, v9, a0
661
+ ; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
662
+ ; RV64-NEXT: vse8.v v9, (a1)
655
663
; RV64-NEXT: addi sp, s0, -128
656
664
; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
657
665
; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
0 commit comments