@@ -490,43 +490,38 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ;
 ; AVX512F-LABEL: var_funnnel_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
@@ -975,70 +970,65 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v32i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
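For context, the CHECK lines above are FileCheck assertions over llc output; the diff replaces the old negate-then-shift-left rotate expansion with a direct right-shift-based one. Assuming this is the llvm.fshr-based byte-rotate test (the IR bodies are not shown in the diff, so the following is a hedged sketch rather than the file's exact content), the two functions under test would look roughly like:

declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)

; Per-element variable rotate: fshr with %x as both value operands.
define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
  ret <32 x i8> %res
}

; Splatted rotate amount: element 0 of %amt broadcast to all lanes before the call.
define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
  ret <32 x i8> %res
}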