Skip to content

Commit 34013e7

Browse files
committed
[X86] Add shuffle tests for BLEND(PERMUTE(X),PERMUTE(Y)) patterns
Some very basic tests for a case where we could fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y)). These assume the permute masks are the same and "complete" (no undefs/duplicate elements), but we could relax that depending on the blend mask.
1 parent c309dc6 commit 34013e7

File tree

3 files changed

+90
-0
lines changed

3 files changed

+90
-0
lines changed

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,37 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
305305
ret <4 x float> %2
306306
}
307307

308+
; Blend of two identically-permuted inputs, 256-bit variant.
; %a0 and %a1 are each shuffled as <4 x i64> with the same mask <2,3,0,1>
; (swapping the two 128-bit halves), bitcast to <8 x i32>, and then merged by a
; two-input shuffle whose mask <0,9,2,11,12,5,6,15> keeps every element in its
; own lane, i.e. a pure blend that takes lanes 1,3,4,7 from the second operand.
; The checks below record current codegen. With AVX1 and AVX2 both permutes are
; still emitted ahead of the vblendps, whereas the AVX512 checks expect a single
; two-source vpermt2d. This is a candidate for folding
; BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y)).
define <8 x i32> @combine_blend_of_permutes_v8i32(<4 x i64> %a0, <4 x i64> %a1) {
309+
; AVX1-LABEL: combine_blend_of_permutes_v8i32:
310+
; AVX1: # %bb.0:
311+
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
312+
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
313+
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7]
314+
; AVX1-NEXT: ret{{[l|q]}}
315+
;
316+
; AVX2-LABEL: combine_blend_of_permutes_v8i32:
317+
; AVX2: # %bb.0:
318+
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
319+
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
320+
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7]
321+
; AVX2-NEXT: ret{{[l|q]}}
322+
;
323+
; AVX512-LABEL: combine_blend_of_permutes_v8i32:
324+
; AVX512: # %bb.0:
325+
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
326+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
327+
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,21,6,23,16,1,2,19]
328+
; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
329+
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
330+
; AVX512-NEXT: ret{{[l|q]}}
331+
%s0 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
332+
%s1 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
333+
%x0 = bitcast <4 x i64> %s0 to <8 x i32>
334+
%x1 = bitcast <4 x i64> %s1 to <8 x i32>
335+
%r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 12, i32 5, i32 6, i32 15>
336+
ret <8 x i32> %r
337+
}
338+
308339
define <2 x double> @constant_fold_vpermilvar_pd() {
309340
; CHECK-LABEL: constant_fold_vpermilvar_pd:
310341
; CHECK: # %bb.0:

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,3 +973,47 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
973973
%2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer)
974974
ret <8 x i64> %2
975975
}
976+
977+
; Blend of two identically-permuted inputs, 512-bit variant.
; %a0 and %a1 are each shuffled as <8 x i64> with the same mask
; <2,3,0,1,6,7,4,5>, bitcast to <16 x i32>, and then merged by a two-input
; shuffle that keeps every element in its own lane, i.e. a pure blend that takes
; lanes 1,3,4,7,9,11,12,15 from the second operand (k-mask 0x9A9A below).
; The checks record current codegen, where all four configurations still emit
; both vpermq instructions followed by a masked vmovdqa32; they differ only in
; kmovw vs kmovd and retl vs retq. This is a candidate for folding
; BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y)).
define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8 x i64> %a1) {
978+
; X86-AVX512F-LABEL: blend_of_permutes_v16i32:
979+
; X86-AVX512F: # %bb.0:
980+
; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
981+
; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
982+
; X86-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A
983+
; X86-AVX512F-NEXT: kmovw %eax, %k1
984+
; X86-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
985+
; X86-AVX512F-NEXT: retl
986+
;
987+
; X86-AVX512BW-LABEL: blend_of_permutes_v16i32:
988+
; X86-AVX512BW: # %bb.0:
989+
; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
990+
; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
991+
; X86-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A
992+
; X86-AVX512BW-NEXT: kmovd %eax, %k1
993+
; X86-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
994+
; X86-AVX512BW-NEXT: retl
995+
;
996+
; X64-AVX512F-LABEL: blend_of_permutes_v16i32:
997+
; X64-AVX512F: # %bb.0:
998+
; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
999+
; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
1000+
; X64-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A
1001+
; X64-AVX512F-NEXT: kmovw %eax, %k1
1002+
; X64-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1003+
; X64-AVX512F-NEXT: retq
1004+
;
1005+
; X64-AVX512BW-LABEL: blend_of_permutes_v16i32:
1006+
; X64-AVX512BW: # %bb.0:
1007+
; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
1008+
; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
1009+
; X64-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A
1010+
; X64-AVX512BW-NEXT: kmovd %eax, %k1
1011+
; X64-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1012+
; X64-AVX512BW-NEXT: retq
1013+
%s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
1014+
%s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
1015+
%x0 = bitcast <8 x i64> %s0 to <16 x i32>
1016+
%x1 = bitcast <8 x i64> %s1 to <16 x i32>
1017+
%r = shufflevector <16 x i32> %x0, <16 x i32> %x1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
1018+
ret <16 x i32> %r
1019+
}

llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,21 @@ define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
2222
ret <16 x i8> %res0
2323
}
2424

25+
; Blend of two identically-permuted inputs, 128-bit variant.
; %a0 and %a1 are each shuffled as <2 x i64> with the same mask <1,0> (swapping
; the two 64-bit halves), bitcast to <4 x i32>, and then merged by a two-input
; shuffle whose mask <0,5,2,7> keeps every element in its own lane, i.e. a pure
; blend that takes lanes 1 and 3 from the second operand.
; The checks record current codegen, where both pshufd permutes are still
; emitted ahead of the pblendw. This is a candidate for folding
; BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y)).
define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) {
26+
; SSE-LABEL: combine_blend_of_permutes_v4i32:
27+
; SSE: # %bb.0:
28+
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
29+
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
30+
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
31+
; SSE-NEXT: retq
32+
%s0 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
33+
%s1 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
34+
%x0 = bitcast <2 x i64> %s0 to <4 x i32>
35+
%x1 = bitcast <2 x i64> %s1 to <4 x i32>
36+
%r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
37+
ret <4 x i32> %r
38+
}
39+
2540
define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
2641
; SSE-LABEL: PR50049:
2742
; SSE: # %bb.0:

0 commit comments

Comments
 (0)