 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER,AVX512BW-VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER,AVX-VNNI
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512-VNNI
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512VL-VNNI
 
@@ -16,56 +16,28 @@ define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
 define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> %a2) {
 ; ZNVER-LABEL: vpdpwssd_v16i32_accumulate:
 ; ZNVER:       # %bb.0:
-; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm3
-; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm4
-; ZNVER-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; ZNVER-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; ZNVER-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm0
-; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm1
-; ZNVER-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
-; ZNVER-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; ZNVER-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; ZNVER-NEXT:    vpermi2d %zmm0, %zmm3, %zmm5
-; ZNVER-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
-; ZNVER-NEXT:    vpaddd %zmm2, %zmm5, %zmm0
-; ZNVER-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
+; ZNVER-NEXT:    vpdpwssd %zmm1, %zmm0, %zmm2
+; ZNVER-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; ZNVER-NEXT:    retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
 ; AVX512-VNNI:       # %bb.0:
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm3
-; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm4
-; AVX512-VNNI-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
-; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm1
-; AVX512-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
-; AVX512-VNNI-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
-; AVX512-VNNI-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
+; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512-VNNI-NEXT:    vpmaddwd %ymm3, %ymm4, %ymm3
+; AVX512-VNNI-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-VNNI-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512-VNNI-NEXT:    retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
 ; AVX512VL-VNNI:       # %bb.0:
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm3
-; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm4
-; AVX512VL-VNNI-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
-; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512VL-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512VL-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VL-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm1
-; AVX512VL-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512VL-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
-; AVX512VL-VNNI-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
-; AVX512VL-VNNI-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-VNNI-NEXT:    vpmaddwd %ymm3, %ymm4, %ymm3
+; AVX512VL-VNNI-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512VL-VNNI-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512VL-VNNI-NEXT:    retq
   %x0 = sext <32 x i16> %a0 to <32 x i32>
   %x1 = sext <32 x i16> %a1 to <32 x i32>
@@ -78,43 +50,28 @@ define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <1
 }
 
 define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %a2) {
-; ZNVER-LABEL: vpdpwssd_v8i32_accumulate:
-; ZNVER:       # %bb.0:
-; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm0
-; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm1
-; ZNVER-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; ZNVER-NEXT:    vpmovqd %zmm0, %ymm1
-; ZNVER-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; ZNVER-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; ZNVER-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ZNVER-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; ZNVER-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
-; ZNVER-NEXT:    retq
+; AVX512BW-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX512BW-VNNI:       # %bb.0:
+; AVX512BW-VNNI-NEXT:    vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX512BW-VNNI-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512BW-VNNI-NEXT:    retq
+;
+; AVX-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX-VNNI:       # %bb.0:
+; AVX-VNNI-NEXT:    {vex} vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX-VNNI-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX-VNNI-NEXT:    retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
 ; AVX512-VNNI:       # %bb.0:
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-VNNI-NEXT:    vpmovqd %zmm0, %ymm1
-; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512-VNNI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; AVX512-VNNI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-VNNI-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX512-VNNI-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX512-VNNI-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; AVX512-VNNI-NEXT:    retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
 ; AVX512VL-VNNI:       # %bb.0:
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512VL-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512VL-VNNI-NEXT:    vpmovqd %zmm0, %ymm1
-; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-VNNI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; AVX512VL-VNNI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-VNNI-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX512VL-VNNI-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-VNNI-NEXT:    vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX512VL-VNNI-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX512VL-VNNI-NEXT:    retq
   %x0 = sext <16 x i16> %a0 to <16 x i32>
   %x1 = sext <16 x i16> %a1 to <16 x i32>
@@ -127,43 +84,28 @@ define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x
 }
 
 define <4 x i32> @vpdpwssd_v4i32_accumulate(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
-; ZNVER-LABEL: vpdpwssd_v4i32_accumulate:
-; ZNVER:       # %bb.0:
-; ZNVER-NEXT:    vpmovsxwd %xmm0, %ymm0
-; ZNVER-NEXT:    vpmovsxwd %xmm1, %ymm1
-; ZNVER-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; ZNVER-NEXT:    vpmovqd %ymm0, %xmm1
-; ZNVER-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; ZNVER-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; ZNVER-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; ZNVER-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; ZNVER-NEXT:    vzeroupper
-; ZNVER-NEXT:    retq
+; AVX512BW-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX512BW-VNNI:       # %bb.0:
+; AVX512BW-VNNI-NEXT:    vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX512BW-VNNI-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX512BW-VNNI-NEXT:    retq
+;
+; AVX-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX-VNNI:       # %bb.0:
+; AVX-VNNI-NEXT:    {vex} vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX-VNNI-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX-VNNI-NEXT:    retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
 ; AVX512-VNNI:       # %bb.0:
-; AVX512-VNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512-VNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
-; AVX512-VNNI-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX512-VNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-VNNI-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm1[0,2]
-; AVX512-VNNI-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-VNNI-NEXT:    vpaddd %xmm2, %xmm3, %xmm1
-; AVX512-VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-VNNI-NEXT:    vzeroupper
+; AVX512-VNNI-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
+; AVX512-VNNI-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX512-VNNI-NEXT:    retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
 ; AVX512VL-VNNI:       # %bb.0:
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
-; AVX512VL-VNNI-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX512VL-VNNI-NEXT:    vpmovqd %ymm0, %xmm1
-; AVX512VL-VNNI-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512VL-VNNI-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; AVX512VL-VNNI-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX512VL-VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512VL-VNNI-NEXT:    vzeroupper
+; AVX512VL-VNNI-NEXT:    vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX512VL-VNNI-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX512VL-VNNI-NEXT:    retq
   %x0 = sext <8 x i16> %a0 to <8 x i32>
   %x1 = sext <8 x i16> %a1 to <8 x i32>