@@ -603,9 +603,10 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
- ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
- ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
- ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
+ ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u>
+ ; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+ ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+ ; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rsi)
@@ -646,9 +647,10 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
- ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
+ ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u>
+ ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+ ; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX512F-NEXT: vmovdqa %ymm3, (%rsi)
@@ -1111,22 +1113,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512F-LABEL: load_i16_stride3_vf32:
; AVX512F: # %bb.0:
- ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+ ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm6
- ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
- ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
- ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
- ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
- ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
- ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm1
+ ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
+ ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0
+ ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15]
+ ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+ ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm2
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX512F-NEXT: vmovdqa (%rdi), %ymm8
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm9
- ; AVX512F-NEXT: vmovdqa %ymm0, %ymm3
+ ; AVX512F-NEXT: vmovdqa %ymm1, %ymm3
; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
@@ -1140,48 +1142,49 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
- ; AVX512F-NEXT: vmovdqa %ymm0, %ymm10
+ ; AVX512F-NEXT: vmovdqa %ymm1, %ymm10
; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
; AVX512F-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
- ; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm10[5,6,7]
- ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
- ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
- ; AVX512F-NEXT: vmovdqa %ymm11, %ymm12
- ; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm12
- ; AVX512F-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
- ; AVX512F-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15]
- ; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
- ; AVX512F-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
- ; AVX512F-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
- ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
- ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
- ; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10
- ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm11
- ; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1]
- ; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1,2],ymm5[3],ymm11[4,5],ymm5[6],ymm11[7],ymm5[8],ymm11[9,10],ymm5[11],ymm11[12,13],ymm5[14],ymm11[15]
+ ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+ ; AVX512F-NEXT: vpshufb %ymm11, %ymm10, %ymm10
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
+ ; AVX512F-NEXT: vpshufb %xmm11, %xmm12, %xmm12
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
+ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+ ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+ ; AVX512F-NEXT: vmovdqa %ymm12, %ymm13
+ ; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm13
+ ; AVX512F-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
+ ; AVX512F-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+ ; AVX512F-NEXT: vpshufb %xmm11, %xmm14, %xmm11
+ ; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
+ ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
+ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
+ ; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+ ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm12
+ ; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
+ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512F-NEXT: vpshufb %ymm6, %ymm5, %ymm5
- ; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
- ; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
- ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7],ymm8[8],ymm0[9,10],ymm8[11],ymm0[12,13],ymm8[14],ymm0[15]
- ; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+ ; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1
+ ; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
+ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15]
+ ; AVX512F-NEXT: vpshufb %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
+ ; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
- ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
- ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
- ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
- ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm1
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
- ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
- ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+ ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+ ; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+ ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
+ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+ ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512F-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512F-NEXT: vmovdqa64 %zmm0, (%rcx)