Skip to content

Commit 4f80340

Browse files
committed
[X86][SSE] Add tests for permute(phaddw(phaddw(x,y),phaddw(z,w))) -> phaddw(phaddw(),phaddw()) folds.
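We currently only perform this fold when NumEltsPerLane == 4, so these i16 tests (8 elements per 128-bit lane) still end with a separate vpshufd.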
1 parent db13f83 commit 4f80340

File tree

1 file changed

+54
-1
lines changed

1 file changed

+54
-1
lines changed

llvm/test/CodeGen/X86/horizontal-shuffle-4.ll

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,21 @@ define <16 x i8> @permute_packss_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i
3636
ret <16 x i8> %4
3737
}
3838

39+
; 128-bit case: two calls to @llvm.x86.ssse3.phadd.w.128 are combined by a third
; call, and the <8 x i16> result is then rotated left by two elements with a
; shufflevector (mask <2,3,4,5,6,7,0,1>).
; The expected assembly below is three vphaddw instructions followed by a
; separate vpshufd xmm0[1,2,3,0], i.e. the permute is not folded into the
; horizontal-add operands.
define <8 x i16> @permute_phadd_phadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
40+
; CHECK-LABEL: permute_phadd_phadd_128:
41+
; CHECK: ## %bb.0:
42+
; CHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0
43+
; CHECK-NEXT: vphaddw %xmm3, %xmm2, %xmm1
44+
; CHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0
45+
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
46+
; CHECK-NEXT: ret{{[l|q]}}
47+
%1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
48+
%2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a2, <8 x i16> %a3)
49+
%3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2)
50+
%4 = shufflevector <8 x i16> %3, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
51+
ret <8 x i16> %4
52+
}
53+
3954
;
4055
; 256-bit Vectors
4156
;
@@ -55,9 +70,47 @@ define <8 x float> @permute_hadd_hadd_256(<8 x float> %a0, <8 x float> %a1, <8 x
5570
ret <8 x float> %4
5671
}
5772

58-
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
73+
; 256-bit case: two calls to @llvm.x86.avx2.phadd.w are combined by a third call,
; and the <16 x i16> result is then shuffled so that each group of eight
; elements is rotated left by two (mask <2..7,0,1, 10..15,8,9>).
; The expected assembly below is three vphaddw instructions followed by a
; separate vpshufd ymm0[1,2,3,0,5,6,7,4], i.e. the permute is not folded into
; the horizontal-add operands.
define <16 x i16> @permute_phadd_phadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
74+
; CHECK-LABEL: permute_phadd_phadd_256:
75+
; CHECK: ## %bb.0:
76+
; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
77+
; CHECK-NEXT: vphaddw %ymm3, %ymm2, %ymm1
78+
; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
79+
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
80+
; CHECK-NEXT: ret{{[l|q]}}
81+
%1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
82+
%2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a2, <16 x i16> %a3)
83+
%3 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2)
84+
%4 = shufflevector <16 x i16> %3, <16 x i16> poison, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9>
85+
ret <16 x i16> %4
86+
}
87+
88+
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
89+
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
90+
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
91+
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)
92+
93+
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
94+
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
95+
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
96+
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)
5997

6098
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
6199
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
62100
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
63101
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
102+
103+
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
104+
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
105+
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
106+
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)
107+
108+
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
109+
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
110+
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
111+
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)
112+
113+
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
114+
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
115+
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
116+
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)

0 commit comments

Comments
 (0)