@@ -42,11 +42,10 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # %bb.0:
- ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
- ; AVX512BW-NEXT: vpand 16(%rdi), %xmm0, %xmm1
- ; AVX512BW-NEXT: vpand (%rdi), %xmm0, %xmm0
- ; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+ ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
@@ -143,11 +142,10 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F: # %bb.0:
- ; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
- ; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+ ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
@@ -159,11 +157,10 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW: # %bb.0:
- ; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
- ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
- ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
- ; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+ ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
@@ -377,54 +374,42 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
- ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
- ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
- ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
- ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512VL-NEXT: vpmovdb %xmm1, %xmm1
- ; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
- ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW: # %bb.0:
- ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
- ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
- ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
- ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
- ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512BWVL-NEXT: vpmovdb %xmm1, %xmm1
- ; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0
- ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
- ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512VBMIVL-NEXT: vpmovdb %xmm1, %xmm1
- ; AVX512VBMIVL-NEXT: vpmovdb %xmm0, %xmm0
- ; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -1081,49 +1066,42 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F: # %bb.0:
- ; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
- ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
- ; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
- ; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
- ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
- ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
- ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW: # %bb.0:
- ; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
- ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
- ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
- ; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
- ; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # %bb.0:
- ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u>
- ; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
- ; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
+ ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+ ; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL: # %bb.0:
- ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u>
- ; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
- ; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
+ ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
+ ; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
+ ; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1199,54 +1177,42 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # %bb.0:
- ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
- ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
- ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
- ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+ ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512VL-NEXT: vpmovqb %xmm1, %xmm1
- ; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0
- ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+ ; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW: # %bb.0:
- ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
- ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
- ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
- ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+ ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # %bb.0:
- ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512BWVL-NEXT: vpmovqb %xmm1, %xmm1
- ; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0
- ; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+ ; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL: # %bb.0:
- ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
- ; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
- ; AVX512VBMIVL-NEXT: vpmovqb %xmm1, %xmm1
- ; AVX512VBMIVL-NEXT: vpmovqb %xmm0, %xmm0
- ; AVX512VBMIVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vmovd %xmm0, (%rsi)
+ ; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>