@@ -272,154 +272,154 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
272
272
;
273
273
; AVX512BW-LABEL: var_funnnel_v64i8:
274
274
; AVX512BW: # %bb.0:
275
- ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
276
- ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
277
- ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
278
- ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
279
- ; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm2
280
- ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
275
+ ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
276
+ ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
277
+ ; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
278
+ ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
281
279
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
282
- ; AVX512BW-NEXT: vpmovb2m %zmm2, %k2
283
- ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
284
- ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
285
- ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
286
- ; AVX512BW-NEXT: vpsrlw $2, %zmm2, %zmm5
287
- ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
288
- ; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
289
- ; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm5
280
+ ; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
281
+ ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
282
+ ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
283
+ ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
284
+ ; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5
290
285
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
286
+ ; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
291
287
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
292
288
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
293
- ; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
294
- ; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
289
+ ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
290
+ ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
291
+ ; AVX512BW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
292
+ ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
295
293
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
296
- ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
297
- ; AVX512BW-NEXT: vpmovb2m %zmm3 , %k1
294
+ ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
295
+ ; AVX512BW-NEXT: vpmovb2m %zmm2 , %k1
298
296
; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
299
- ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
297
+ ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
300
298
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
301
299
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
302
- ; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm1
300
+ ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
303
301
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
304
302
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
305
- ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
306
- ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
307
- ; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
308
- ; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
303
+ ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
304
+ ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
305
+ ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
306
+ ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
307
+ ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
308
+ ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
309
309
; AVX512BW-NEXT: retq
310
310
;
311
311
; AVX512VLBW-LABEL: var_funnnel_v64i8:
312
312
; AVX512VLBW: # %bb.0:
313
- ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
314
- ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
315
- ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
316
- ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
317
- ; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm2
318
- ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
313
+ ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
314
+ ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
315
+ ; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
316
+ ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
319
317
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
320
- ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k2
321
- ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
322
- ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
323
- ; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
324
- ; AVX512VLBW-NEXT: vpsrlw $2, %zmm2, %zmm5
325
- ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
326
- ; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
327
- ; AVX512VLBW-NEXT: vpsrlw $1, %zmm2, %zmm5
318
+ ; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
319
+ ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
320
+ ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
321
+ ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
322
+ ; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5
328
323
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
324
+ ; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
329
325
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
330
326
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
331
- ; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
332
- ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm1, %zmm1
327
+ ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
328
+ ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
329
+ ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
330
+ ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
333
331
; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
334
- ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
335
- ; AVX512VLBW-NEXT: vpmovb2m %zmm3 , %k1
332
+ ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
333
+ ; AVX512VLBW-NEXT: vpmovb2m %zmm2 , %k1
336
334
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
337
- ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
335
+ ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
338
336
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
339
337
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
340
- ; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm1
338
+ ; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
341
339
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
342
340
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
343
- ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
344
- ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
345
- ; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
346
- ; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
341
+ ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
342
+ ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
343
+ ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
344
+ ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
345
+ ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
346
+ ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
347
347
; AVX512VLBW-NEXT: retq
348
348
;
349
349
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
350
350
; AVX512VBMI2: # %bb.0:
351
- ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
352
- ; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
353
- ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
354
- ; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
355
- ; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
356
- ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
351
+ ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
352
+ ; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
353
+ ; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
354
+ ; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
357
355
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
358
- ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k2
359
- ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
360
- ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
361
- ; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
362
- ; AVX512VBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
363
- ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
364
- ; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
365
- ; AVX512VBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
356
+ ; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
357
+ ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
358
+ ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
359
+ ; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
360
+ ; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
366
361
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
362
+ ; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
367
363
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
368
364
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
369
- ; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
370
- ; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
365
+ ; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
366
+ ; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
367
+ ; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1
368
+ ; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
371
369
; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
372
- ; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
373
- ; AVX512VBMI2-NEXT: vpmovb2m %zmm3 , %k1
370
+ ; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
371
+ ; AVX512VBMI2-NEXT: vpmovb2m %zmm2 , %k1
374
372
; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
375
- ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
373
+ ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
376
374
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
377
375
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
378
- ; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
376
+ ; AVX512VBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
379
377
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
380
378
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
381
- ; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
382
- ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
383
- ; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
384
- ; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
379
+ ; AVX512VBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
380
+ ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
381
+ ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
382
+ ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
383
+ ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
384
+ ; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
385
385
; AVX512VBMI2-NEXT: retq
386
386
;
387
387
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
388
388
; AVX512VLVBMI2: # %bb.0:
389
- ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
390
- ; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
391
- ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
392
- ; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
393
- ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
394
- ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
389
+ ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
390
+ ; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
391
+ ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
392
+ ; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
395
393
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
396
- ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k2
397
- ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
398
- ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
399
- ; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
400
- ; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
401
- ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
402
- ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
403
- ; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
394
+ ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
395
+ ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
396
+ ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
397
+ ; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
398
+ ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
404
399
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
400
+ ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
405
401
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
406
402
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
407
- ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
408
- ; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
403
+ ; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
404
+ ; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
405
+ ; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1
406
+ ; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
409
407
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
410
- ; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
411
- ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3 , %k1
408
+ ; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
409
+ ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2 , %k1
412
410
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
413
- ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
411
+ ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
414
412
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
415
413
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
416
- ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
414
+ ; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
417
415
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
418
416
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
419
- ; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
420
- ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
421
- ; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
422
- ; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
417
+ ; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
418
+ ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
419
+ ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
420
+ ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
421
+ ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
422
+ ; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
423
423
; AVX512VLVBMI2-NEXT: retq
424
424
%res = call <64 x i8 > @llvm.fshl.v64i8 (<64 x i8 > %x , <64 x i8 > %x , <64 x i8 > %amt )
425
425
ret <64 x i8 > %res
0 commit comments