Skip to content

Commit ce4801d

Browse files
committed
[X86] vpdpwssd.ll - add test coverage for #118443
1 parent f081ffe commit ce4801d

File tree

1 file changed

+166
-3
lines changed

1 file changed

+166
-3
lines changed

llvm/test/CodeGen/X86/vpdpwssd.ll

Lines changed: 166 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
3-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s
4-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512-VNNI
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512VL-VNNI
56

67
define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
78
; CHECK-LABEL: vpdpwssd_test:
@@ -11,3 +12,165 @@ define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
1112
%4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
1213
ret <16 x i32> %4
1314
}
15+
16+
; 512-bit case: sext(w)*sext(w) with even/odd deinterleave + adds — the pattern
; VPDPWSSD implements natively (see #118443); currently not folded on any target.
define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> %a2) {
; ZNVER-LABEL: vpdpwssd_v16i32_accumulate:
; ZNVER:       # %bb.0:
; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm3
; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm4
; ZNVER-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; ZNVER-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; ZNVER-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm0
; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm1
; ZNVER-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
; ZNVER-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; ZNVER-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; ZNVER-NEXT:    vpermi2d %zmm0, %zmm3, %zmm5
; ZNVER-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; ZNVER-NEXT:    vpaddd %zmm2, %zmm5, %zmm0
; ZNVER-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
; ZNVER-NEXT:    retq
;
; AVX512-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
; AVX512-VNNI:       # %bb.0:
; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm3
; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm4
; AVX512-VNNI-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm1
; AVX512-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512-VNNI-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; AVX512-VNNI-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
; AVX512-VNNI-NEXT:    retq
;
; AVX512VL-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
; AVX512VL-VNNI:       # %bb.0:
; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm3
; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm4
; AVX512VL-VNNI-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512VL-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512VL-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512VL-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm1
; AVX512VL-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512VL-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-VNNI-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; AVX512VL-VNNI-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
; AVX512VL-VNNI-NEXT:    retq
  %x0 = sext <32 x i16> %a0 to <32 x i32>
  %x1 = sext <32 x i16> %a1 to <32 x i32>
  %m = mul nsw <32 x i32> %x0, %x1
  %lo = shufflevector <32 x i32> %m, <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %hi = shufflevector <32 x i32> %m, <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %r0 = add <16 x i32> %lo, %a2
  %r1 = add <16 x i32> %r0, %hi
  ret <16 x i32> %r1
}
79+
80+
; 256-bit case of the same dot-product-accumulate pattern; note the adds are
; associated differently here (hi+acc first) than in the 512-bit test.
define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %a2) {
; ZNVER-LABEL: vpdpwssd_v8i32_accumulate:
; ZNVER:       # %bb.0:
; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm0
; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm1
; ZNVER-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; ZNVER-NEXT:    vpmovqd %zmm0, %ymm1
; ZNVER-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; ZNVER-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
; ZNVER-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; ZNVER-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; ZNVER-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; ZNVER-NEXT:    retq
;
; AVX512-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
; AVX512-VNNI:       # %bb.0:
; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-VNNI-NEXT:    vpmovqd %zmm0, %ymm1
; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512-VNNI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
; AVX512-VNNI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-VNNI-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX512-VNNI-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512-VNNI-NEXT:    retq
;
; AVX512VL-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
; AVX512VL-VNNI:       # %bb.0:
; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512VL-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512VL-VNNI-NEXT:    vpmovqd %zmm0, %ymm1
; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-VNNI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
; AVX512VL-VNNI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-VNNI-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX512VL-VNNI-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512VL-VNNI-NEXT:    retq
  %x0 = sext <16 x i16> %a0 to <16 x i32>
  %x1 = sext <16 x i16> %a1 to <16 x i32>
  %m = mul nsw <16 x i32> %x0, %x1
  %lo = shufflevector <16 x i32> %m, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %hi = shufflevector <16 x i32> %m, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r0 = add <8 x i32> %hi, %a2
  %r1 = add <8 x i32> %lo, %r0
  ret <8 x i32> %r1
}
128+
129+
; 128-bit case of the same dot-product-accumulate pattern; widening to ymm
; forces vzeroupper before returning to (potentially SSE) callers.
define <4 x i32> @vpdpwssd_v4i32_accumulate(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
; ZNVER-LABEL: vpdpwssd_v4i32_accumulate:
; ZNVER:       # %bb.0:
; ZNVER-NEXT:    vpmovsxwd %xmm0, %ymm0
; ZNVER-NEXT:    vpmovsxwd %xmm1, %ymm1
; ZNVER-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; ZNVER-NEXT:    vpmovqd %ymm0, %xmm1
; ZNVER-NEXT:    vextracti128 $1, %ymm0, %xmm3
; ZNVER-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
; ZNVER-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; ZNVER-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; ZNVER-NEXT:    vzeroupper
; ZNVER-NEXT:    retq
;
; AVX512-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
; AVX512-VNNI:       # %bb.0:
; AVX512-VNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512-VNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVX512-VNNI-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-VNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-VNNI-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm1[0,2]
; AVX512-VNNI-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-VNNI-NEXT:    vpaddd %xmm2, %xmm3, %xmm1
; AVX512-VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-VNNI-NEXT:    vzeroupper
; AVX512-VNNI-NEXT:    retq
;
; AVX512VL-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
; AVX512VL-VNNI:       # %bb.0:
; AVX512VL-VNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512VL-VNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVX512VL-VNNI-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512VL-VNNI-NEXT:    vpmovqd %ymm0, %xmm1
; AVX512VL-VNNI-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512VL-VNNI-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
; AVX512VL-VNNI-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX512VL-VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VL-VNNI-NEXT:    vzeroupper
; AVX512VL-VNNI-NEXT:    retq
  %x0 = sext <8 x i16> %a0 to <8 x i32>
  %x1 = sext <8 x i16> %a1 to <8 x i32>
  %m = mul nsw <8 x i32> %x0, %x1
  %lo = shufflevector <8 x i32> %m, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hi = shufflevector <8 x i32> %m, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r0 = add <4 x i32> %lo, %a2
  %r1 = add <4 x i32> %hi, %r0
  ret <4 x i32> %r1
}

0 commit comments

Comments
 (0)