@@ -501,136 +501,138 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
501
501
; SSE-LABEL: load_i16_stride2_vf64:
502
502
; SSE: # %bb.0:
503
503
; SSE-NEXT: subq $40, %rsp
504
- ; SSE-NEXT: movdqa 96(%rdi), %xmm13
505
- ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
506
- ; SSE-NEXT: movdqa 112(%rdi), %xmm3
507
- ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
508
- ; SSE-NEXT: movdqa 128(%rdi), %xmm11
509
- ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
510
- ; SSE-NEXT: movdqa 144(%rdi), %xmm2
504
+ ; SSE-NEXT: movdqa 160(%rdi), %xmm14
505
+ ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
506
+ ; SSE-NEXT: movdqa 176(%rdi), %xmm2
511
507
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
512
- ; SSE-NEXT: movdqa 160(%rdi), %xmm10
513
- ; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill
514
- ; SSE-NEXT: movdqa 176(%rdi), %xmm4
515
- ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
516
- ; SSE-NEXT: movdqa (%rdi), %xmm9
517
- ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
518
- ; SSE-NEXT: movdqa 16(%rdi), %xmm1
508
+ ; SSE-NEXT: movdqa 64(%rdi), %xmm11
509
+ ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
510
+ ; SSE-NEXT: movdqa 80(%rdi), %xmm1
519
511
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
520
- ; SSE-NEXT: movdqa 32(%rdi), %xmm12
521
- ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
522
- ; SSE-NEXT: movdqa 48(%rdi), %xmm14
523
- ; SSE-NEXT: movdqa %xmm14, %xmm0
512
+ ; SSE-NEXT: movdqa 96(%rdi), %xmm9
513
+ ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
514
+ ; SSE-NEXT: movdqa 112(%rdi), %xmm4
515
+ ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
516
+ ; SSE-NEXT: movdqa (%rdi), %xmm10
517
+ ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
518
+ ; SSE-NEXT: movdqa 16(%rdi), %xmm7
519
+ ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
520
+ ; SSE-NEXT: movdqa 32(%rdi), %xmm13
521
+ ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
522
+ ; SSE-NEXT: movdqa 48(%rdi), %xmm0
523
+ ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
524
524
; SSE-NEXT: pslld $16, %xmm0
525
525
; SSE-NEXT: psrad $16, %xmm0
526
- ; SSE-NEXT: pslld $16, %xmm12
527
- ; SSE-NEXT: psrad $16, %xmm12
528
- ; SSE-NEXT: packssdw %xmm0, %xmm12
529
- ; SSE-NEXT: movdqa %xmm4 , %xmm0
526
+ ; SSE-NEXT: pslld $16, %xmm13
527
+ ; SSE-NEXT: psrad $16, %xmm13
528
+ ; SSE-NEXT: packssdw %xmm0, %xmm13
529
+ ; SSE-NEXT: movdqa %xmm7 , %xmm0
530
530
; SSE-NEXT: pslld $16, %xmm0
531
531
; SSE-NEXT: psrad $16, %xmm0
532
532
; SSE-NEXT: pslld $16, %xmm10
533
533
; SSE-NEXT: psrad $16, %xmm10
534
534
; SSE-NEXT: packssdw %xmm0, %xmm10
535
- ; SSE-NEXT: movdqa %xmm1 , %xmm0
535
+ ; SSE-NEXT: movdqa %xmm4 , %xmm0
536
536
; SSE-NEXT: pslld $16, %xmm0
537
537
; SSE-NEXT: psrad $16, %xmm0
538
538
; SSE-NEXT: pslld $16, %xmm9
539
539
; SSE-NEXT: psrad $16, %xmm9
540
540
; SSE-NEXT: packssdw %xmm0, %xmm9
541
- ; SSE-NEXT: movdqa %xmm2 , %xmm0
541
+ ; SSE-NEXT: movdqa %xmm1 , %xmm0
542
542
; SSE-NEXT: pslld $16, %xmm0
543
543
; SSE-NEXT: psrad $16, %xmm0
544
544
; SSE-NEXT: pslld $16, %xmm11
545
545
; SSE-NEXT: psrad $16, %xmm11
546
546
; SSE-NEXT: packssdw %xmm0, %xmm11
547
- ; SSE-NEXT: movdqa %xmm3 , %xmm0
547
+ ; SSE-NEXT: movdqa %xmm2 , %xmm0
548
548
; SSE-NEXT: pslld $16, %xmm0
549
549
; SSE-NEXT: psrad $16, %xmm0
550
- ; SSE-NEXT: pslld $16, %xmm13
551
- ; SSE-NEXT: psrad $16, %xmm13
552
- ; SSE-NEXT: packssdw %xmm0, %xmm13
553
- ; SSE-NEXT: movdqa 240 (%rdi), %xmm0
554
- ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
550
+ ; SSE-NEXT: pslld $16, %xmm14
551
+ ; SSE-NEXT: psrad $16, %xmm14
552
+ ; SSE-NEXT: packssdw %xmm0, %xmm14
553
+ ; SSE-NEXT: movdqa 144 (%rdi), %xmm7
554
+ ; SSE-NEXT: movdqa %xmm7, %xmm0
555
555
; SSE-NEXT: pslld $16, %xmm0
556
556
; SSE-NEXT: psrad $16, %xmm0
557
- ; SSE-NEXT: movdqa 224 (%rdi), %xmm7
558
- ; SSE-NEXT: movdqa %xmm7 , %xmm15
557
+ ; SSE-NEXT: movdqa 128 (%rdi), %xmm8
558
+ ; SSE-NEXT: movdqa %xmm8 , %xmm15
559
559
; SSE-NEXT: pslld $16, %xmm15
560
560
; SSE-NEXT: psrad $16, %xmm15
561
561
; SSE-NEXT: packssdw %xmm0, %xmm15
562
- ; SSE-NEXT: movdqa 80 (%rdi), %xmm3
563
- ; SSE-NEXT: movdqa %xmm3 , %xmm1
562
+ ; SSE-NEXT: movdqa 240 (%rdi), %xmm12
563
+ ; SSE-NEXT: movdqa %xmm12 , %xmm1
564
564
; SSE-NEXT: pslld $16, %xmm1
565
565
; SSE-NEXT: psrad $16, %xmm1
566
- ; SSE-NEXT: movdqa 64(%rdi), %xmm5
567
- ; SSE-NEXT: movdqa %xmm5, %xmm4
566
+ ; SSE-NEXT: movdqa 224(%rdi), %xmm5
567
+ ; SSE-NEXT: movdqa %xmm5, %xmm3
568
+ ; SSE-NEXT: pslld $16, %xmm3
569
+ ; SSE-NEXT: psrad $16, %xmm3
570
+ ; SSE-NEXT: packssdw %xmm1, %xmm3
571
+ ; SSE-NEXT: movdqa 208(%rdi), %xmm6
572
+ ; SSE-NEXT: movdqa %xmm6, %xmm4
568
573
; SSE-NEXT: pslld $16, %xmm4
569
574
; SSE-NEXT: psrad $16, %xmm4
570
- ; SSE-NEXT: packssdw %xmm1, %xmm4
571
- ; SSE-NEXT: movdqa 208(%rdi), %xmm8
572
- ; SSE-NEXT: movdqa %xmm8, %xmm6
573
- ; SSE-NEXT: pslld $16, %xmm6
574
- ; SSE-NEXT: psrad $16, %xmm6
575
575
; SSE-NEXT: movdqa 192(%rdi), %xmm2
576
576
; SSE-NEXT: movdqa %xmm2, %xmm1
577
577
; SSE-NEXT: pslld $16, %xmm1
578
578
; SSE-NEXT: psrad $16, %xmm1
579
- ; SSE-NEXT: packssdw %xmm6, %xmm1
580
- ; SSE-NEXT: psrad $16, %xmm14
581
- ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
582
- ; SSE-NEXT: psrad $16, %xmm0
583
- ; SSE-NEXT: packssdw %xmm14, %xmm0
584
- ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
579
+ ; SSE-NEXT: packssdw %xmm4, %xmm1
585
580
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
586
581
; SSE-NEXT: psrad $16, %xmm0
587
- ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
588
- ; SSE-NEXT: psrad $16, %xmm6
589
- ; SSE-NEXT: packssdw %xmm0, %xmm6
590
- ; SSE-NEXT: movdqa %xmm6 , {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
582
+ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
583
+ ; SSE-NEXT: psrad $16, %xmm4
584
+ ; SSE-NEXT: packssdw %xmm0, %xmm4
585
+ ; SSE-NEXT: movdqa %xmm4 , {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
591
586
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
592
587
; SSE-NEXT: psrad $16, %xmm0
593
- ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
594
- ; SSE-NEXT: psrad $16, %xmm14
595
- ; SSE-NEXT: packssdw %xmm0, %xmm14
596
- ; SSE-NEXT: psrad $16, %xmm3
597
- ; SSE-NEXT: psrad $16, %xmm5
598
- ; SSE-NEXT: packssdw %xmm3, %xmm5
588
+ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
589
+ ; SSE-NEXT: psrad $16, %xmm4
590
+ ; SSE-NEXT: packssdw %xmm0, %xmm4
591
+ ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
599
592
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
600
593
; SSE-NEXT: psrad $16, %xmm0
601
- ; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload
602
- ; SSE-NEXT: psrad $16, %xmm6
603
- ; SSE-NEXT: packssdw %xmm0, %xmm6
604
- ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
594
+ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
595
+ ; SSE-NEXT: psrad $16, %xmm4
596
+ ; SSE-NEXT: packssdw %xmm0, %xmm4
597
+ ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
598
+ ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
605
599
; SSE-NEXT: psrad $16, %xmm0
606
- ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
607
- ; SSE-NEXT: psrad $16, %xmm3
608
- ; SSE-NEXT: packssdw %xmm0, %xmm3
600
+ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
601
+ ; SSE-NEXT: psrad $16, %xmm4
602
+ ; SSE-NEXT: packssdw %xmm0, %xmm4
603
+ ; SSE-NEXT: psrad $16, %xmm7
604
+ ; SSE-NEXT: psrad $16, %xmm8
605
+ ; SSE-NEXT: packssdw %xmm7, %xmm8
609
606
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
610
607
; SSE-NEXT: psrad $16, %xmm0
608
+ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
611
609
; SSE-NEXT: psrad $16, %xmm7
612
610
; SSE-NEXT: packssdw %xmm0, %xmm7
613
- ; SSE-NEXT: psrad $16, %xmm8
611
+ ; SSE-NEXT: psrad $16, %xmm6
614
612
; SSE-NEXT: psrad $16, %xmm2
615
- ; SSE-NEXT: packssdw %xmm8, %xmm2
613
+ ; SSE-NEXT: packssdw %xmm6, %xmm2
614
+ ; SSE-NEXT: psrad $16, %xmm12
615
+ ; SSE-NEXT: psrad $16, %xmm5
616
+ ; SSE-NEXT: packssdw %xmm12, %xmm5
616
617
; SSE-NEXT: movdqa %xmm1, 96(%rsi)
617
- ; SSE-NEXT: movdqa %xmm4, 32(%rsi)
618
- ; SSE-NEXT: movdqa %xmm15, 112(%rsi)
619
- ; SSE-NEXT: movdqa %xmm13, 48(%rsi)
620
- ; SSE-NEXT: movdqa %xmm11, 64(%rsi)
621
- ; SSE-NEXT: movdqa %xmm9, (%rsi)
622
- ; SSE-NEXT: movdqa %xmm10, 80(%rsi)
623
- ; SSE-NEXT: movdqa %xmm12, 16(%rsi)
618
+ ; SSE-NEXT: movdqa %xmm3, 112(%rsi)
619
+ ; SSE-NEXT: movdqa %xmm15, 64(%rsi)
620
+ ; SSE-NEXT: movdqa %xmm14, 80(%rsi)
621
+ ; SSE-NEXT: movdqa %xmm11, 32(%rsi)
622
+ ; SSE-NEXT: movdqa %xmm9, 48(%rsi)
623
+ ; SSE-NEXT: movdqa %xmm10, (%rsi)
624
+ ; SSE-NEXT: movdqa %xmm13, 16(%rsi)
625
+ ; SSE-NEXT: movdqa %xmm5, 112(%rdx)
624
626
; SSE-NEXT: movdqa %xmm2, 96(%rdx)
625
- ; SSE-NEXT: movdqa %xmm7, 112(%rdx)
626
- ; SSE-NEXT: movdqa %xmm3, 64(%rdx)
627
- ; SSE-NEXT: movdqa %xmm6, 80(%rdx)
628
- ; SSE-NEXT: movdqa %xmm5, 32(%rdx)
629
- ; SSE-NEXT: movdqa %xmm14, 48(%rdx)
627
+ ; SSE-NEXT: movdqa %xmm7, 80(%rdx)
628
+ ; SSE-NEXT: movdqa %xmm8, 64(%rdx)
629
+ ; SSE-NEXT: movdqa %xmm4, 48(%rdx)
630
630
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
631
- ; SSE-NEXT: movaps %xmm0, (%rdx)
631
+ ; SSE-NEXT: movaps %xmm0, 32 (%rdx)
632
632
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
633
633
; SSE-NEXT: movaps %xmm0, 16(%rdx)
634
+ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
635
+ ; SSE-NEXT: movaps %xmm0, (%rdx)
634
636
; SSE-NEXT: addq $40, %rsp
635
637
; SSE-NEXT: retq
636
638
;
0 commit comments