Skip to content

Commit a3a9ba8

Browse files
committed
[X86] lowerShuffleAsVTRUNC - ensure we peek through bitcasts when looking for freely-concatable subvectors
Fixes llvm#111611
1 parent 64421ec commit a3a9ba8

File tree

4 files changed: +33 -113 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10356,7 +10356,7 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10356 10356      }
10357 10357      return false;
10358 10358    };
10359       -  if (!IsCheapConcat(V1, V2))
      10359 +  if (!IsCheapConcat(peekThroughBitcasts(V1), peekThroughBitcasts(V2)))
10360 10360      continue;
10361 10361    }
10362 10362

llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll

Lines changed: 8 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -797,69 +797,12 @@ define <16 x i8> @oddelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwin
797797
; AVX2-NEXT: vzeroupper
798798
; AVX2-NEXT: retq
799799
;
800-
; AVX512F-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
801-
; AVX512F: # %bb.0:
802-
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
803-
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
804-
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
805-
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
806-
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
807-
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
808-
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
809-
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
810-
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
811-
; AVX512F-NEXT: vzeroupper
812-
; AVX512F-NEXT: retq
813-
;
814-
; AVX512VL-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
815-
; AVX512VL: # %bb.0:
816-
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
817-
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
818-
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
819-
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
820-
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
821-
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
822-
; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
823-
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
824-
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
825-
; AVX512VL-NEXT: vzeroupper
826-
; AVX512VL-NEXT: retq
827-
;
828-
; AVX512BW-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
829-
; AVX512BW: # %bb.0:
830-
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
831-
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
832-
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
833-
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
834-
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
835-
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
836-
; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
837-
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
838-
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
839-
; AVX512BW-NEXT: vzeroupper
840-
; AVX512BW-NEXT: retq
841-
;
842-
; AVX512BWVL-ONLY-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
843-
; AVX512BWVL-ONLY: # %bb.0:
844-
; AVX512BWVL-ONLY-NEXT: vextracti64x4 $1, %zmm0, %ymm1
845-
; AVX512BWVL-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2
846-
; AVX512BWVL-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
847-
; AVX512BWVL-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
848-
; AVX512BWVL-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
849-
; AVX512BWVL-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
850-
; AVX512BWVL-ONLY-NEXT: vpsrld $16, %ymm0, %ymm0
851-
; AVX512BWVL-ONLY-NEXT: vpmovdb %ymm0, %xmm0
852-
; AVX512BWVL-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
853-
; AVX512BWVL-ONLY-NEXT: vzeroupper
854-
; AVX512BWVL-ONLY-NEXT: retq
855-
;
856-
; AVX512VBMI-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
857-
; AVX512VBMI: # %bb.0:
858-
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
859-
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
860-
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
861-
; AVX512VBMI-NEXT: vzeroupper
862-
; AVX512VBMI-NEXT: retq
800+
; AVX512-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
801+
; AVX512: # %bb.0:
802+
; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
803+
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
804+
; AVX512-NEXT: vzeroupper
805+
; AVX512-NEXT: retq
863806
%n0 = bitcast <32 x i16> %n2 to <64 x i8>
864807
%p = shufflevector <64 x i8> %n0, <64 x i8> poison, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
865808
ret <16 x i8> %p
@@ -1548,3 +1491,5 @@ define <16 x i8> @oddelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
1548 1491    %n1 = trunc <16 x i16> %n0 to <16 x i8>
1549 1492    ret <16 x i8> %n1
1550 1493  }
     1494 +  ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
     1495 +  ; AVX512VBMI: {{.*}}

llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll

Lines changed: 19 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -389,57 +389,28 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
389389
; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
390390
; AVX512VL-FAST-PERLANE-NEXT: retq
391391
;
392-
; AVX512BW-FAST-ALL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
393-
; AVX512BW-FAST-ALL: # %bb.0:
394-
; AVX512BW-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,9,11]
395-
; AVX512BW-FAST-ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
396-
; AVX512BW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
397-
; AVX512BW-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
398-
; AVX512BW-FAST-ALL-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
399-
; AVX512BW-FAST-ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
400-
; AVX512BW-FAST-ALL-NEXT: retq
401-
;
402-
; AVX512BW-FAST-PERLANE-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
403-
; AVX512BW-FAST-PERLANE: # %bb.0:
404-
; AVX512BW-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm1
405-
; AVX512BW-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
406-
; AVX512BW-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
407-
; AVX512BW-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
408-
; AVX512BW-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
409-
; AVX512BW-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
410-
; AVX512BW-FAST-PERLANE-NEXT: retq
411-
;
412-
; AVX512BWVL-FAST-ALL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
413-
; AVX512BWVL-FAST-ALL: # %bb.0:
414-
; AVX512BWVL-FAST-ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
415-
; AVX512BWVL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
416-
; AVX512BWVL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
417-
; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,5,7]
418-
; AVX512BWVL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm2, %ymm0
419-
; AVX512BWVL-FAST-ALL-NEXT: retq
392+
; AVX512BW-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
393+
; AVX512BW: # %bb.0:
394+
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
395+
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
396+
; AVX512BW-NEXT: retq
420397
;
421-
; AVX512BWVL-FAST-PERLANE-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
422-
; AVX512BWVL-FAST-PERLANE: # %bb.0:
423-
; AVX512BWVL-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm1
424-
; AVX512BWVL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
425-
; AVX512BWVL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
426-
; AVX512BWVL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
427-
; AVX512BWVL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
428-
; AVX512BWVL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
429-
; AVX512BWVL-FAST-PERLANE-NEXT: retq
398+
; AVX512BWVL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
399+
; AVX512BWVL: # %bb.0:
400+
; AVX512BWVL-NEXT: vpsrlw $8, %zmm0, %zmm0
401+
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
402+
; AVX512BWVL-NEXT: retq
430403
;
431404
; AVX512VBMI-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
432405
; AVX512VBMI: # %bb.0:
433-
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15,97,99,101,103,105,107,109,111,17,19,21,23,25,27,29,31,113,115,117,119,121,123,125,127]
434-
; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm0, %zmm1
435-
; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3]
406+
; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0
407+
; AVX512VBMI-NEXT: vpmovwb %zmm0, %ymm0
436408
; AVX512VBMI-NEXT: retq
437409
;
438410
; AVX512VBMIVL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
439411
; AVX512VBMIVL: # %bb.0:
440-
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
441-
; AVX512VBMIVL-NEXT: vpermb %zmm0, %zmm1, %zmm0
442-
; AVX512VBMIVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
412+
; AVX512VBMIVL-NEXT: vpsrlw $8, %zmm0, %zmm0
413+
; AVX512VBMIVL-NEXT: vpmovwb %zmm0, %ymm0
443414
; AVX512VBMIVL-NEXT: retq
444415
%bc = bitcast <32 x i16> %a0 to <64 x i8>
445416
%res = shufflevector <64 x i8> %bc, <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
@@ -523,3 +494,8 @@ define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
523 494    ret <16 x i8> %result
524 495  }
525 496
    497 +  ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
    498 +  ; AVX512BW-FAST-ALL: {{.*}}
    499 +  ; AVX512BW-FAST-PERLANE: {{.*}}
    500 +  ; AVX512BWVL-FAST-ALL: {{.*}}
    501 +  ; AVX512BWVL-FAST-PERLANE: {{.*}}

llvm/test/CodeGen/X86/x86-interleaved-access.ll

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -405,14 +405,13 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind {
405405
; AVX512-NEXT: vmovdqu (%rdi), %ymm0
406406
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
407407
; AVX512-NEXT: vpmovsxbw {{.*#+}} xmm2 = [3,7,11,15,7,15,6,7]
408-
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm3
408+
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
409409
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2
410-
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
411-
; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
412-
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm3
413-
; AVX512-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
414-
; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1
410+
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
415411
; AVX512-NEXT: vmovdqu (%rdi), %ymm2
412+
; AVX512-NEXT: vpsrld $16, %ymm2, %ymm3
413+
; AVX512-NEXT: vpmovdw %zmm3, %ymm3
414+
; AVX512-NEXT: vpaddb %xmm3, %xmm1, %xmm1
416415
; AVX512-NEXT: vpmovdw %zmm2, %ymm2
417416
; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
418417
; AVX512-NEXT: vpmovwb %zmm0, %ymm0

0 commit comments

Comments
 (0)