Skip to content

Commit b032920

Browse files
authored
[X86][AVX10.2] Support AVX10.2 VNNI FP16/INT8/INT16 new instructions (#101783)
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965
1 parent 77011b0 commit b032920

28 files changed

+8815
-251
lines changed

clang/include/clang/Basic/BuiltinsX86.def

Lines changed: 45 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -773,18 +773,18 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512v
773773
TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
774774

775775
// AVX-VNNI-INT8
776-
TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
777-
TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
778-
TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
779-
TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
780-
TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
781-
TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
782-
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
783-
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
784-
TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
785-
TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
786-
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
787-
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
776+
TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
777+
TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
778+
TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
779+
TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
780+
TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
781+
TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
782+
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
783+
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
784+
TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
785+
TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
786+
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
787+
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
788788

789789
TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
790790
TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
@@ -1959,6 +1959,27 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
19591959
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
19601960
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
19611961

1962+
// AVX10.2 VNNI FP16
1963+
TARGET_BUILTIN(__builtin_ia32_vdpphps128, "V4fV4fV8xV8x", "ncV:128:", "avx10.2-256")
1964+
TARGET_BUILTIN(__builtin_ia32_vdpphps256, "V8fV8fV16xV16x", "ncV:256:", "avx10.2-256")
1965+
TARGET_BUILTIN(__builtin_ia32_vdpphps512, "V16fV16fV32xV32x", "ncV:512:", "avx10.2-512")
1966+
1967+
// AVX10.2 VNNI INT8
1968+
TARGET_BUILTIN(__builtin_ia32_vpdpbssd512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
1969+
TARGET_BUILTIN(__builtin_ia32_vpdpbssds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
1970+
TARGET_BUILTIN(__builtin_ia32_vpdpbsud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
1971+
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
1972+
TARGET_BUILTIN(__builtin_ia32_vpdpbuud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
1973+
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
1974+
1975+
// AVX10.2 VNNI INT16
1976+
TARGET_BUILTIN(__builtin_ia32_vpdpwsud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
1977+
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
1978+
TARGET_BUILTIN(__builtin_ia32_vpdpwusd512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
1979+
TARGET_BUILTIN(__builtin_ia32_vpdpwusds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
1980+
TARGET_BUILTIN(__builtin_ia32_vpdpwuud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
1981+
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
1982+
19621983
// AVX10.2 VMPSADBW
19631984
TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
19641985

@@ -2088,18 +2109,18 @@ TARGET_BUILTIN(__builtin_ia32_vsubph256_round, "V16xV16xV16xIi", "nV:256:", "avx
20882109
TARGET_BUILTIN(__builtin_ia32_vsubps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
20892110

20902111
// AVX-VNNI-INT16
2091-
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
2092-
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
2093-
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
2094-
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
2095-
TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
2096-
TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
2097-
TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
2098-
TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
2099-
TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
2100-
TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
2101-
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
2102-
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
2112+
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
2113+
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
2114+
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
2115+
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
2116+
TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
2117+
TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
2118+
TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
2119+
TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
2120+
TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
2121+
TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
2122+
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
2123+
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
21032124

21042125
// AVX-NE-CONVERT
21052126
TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")

clang/lib/Headers/avx10_2_512niintrin.h

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,35 @@
1616
#ifndef __AVX10_2_512NIINTRIN_H
1717
#define __AVX10_2_512NIINTRIN_H
1818

19+
#define __DEFAULT_FN_ATTRS \
20+
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
21+
__min_vector_width__(512)))
22+
23+
/* VNNI FP16 */
24+
static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_dpph_ps(__m512 __W,
25+
__m512h __A,
26+
__m512h __B) {
27+
return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A,
28+
(__v32hf)__B);
29+
}
30+
31+
static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_dpph_ps(__m512 __W,
32+
__mmask16 __U,
33+
__m512h __A,
34+
__m512h __B) {
35+
return (__m512)__builtin_ia32_selectps_512(
36+
(__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), (__v16sf)__W);
37+
}
38+
39+
static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U,
40+
__m512 __W,
41+
__m512h __A,
42+
__m512h __B) {
43+
return (__m512)__builtin_ia32_selectps_512(
44+
(__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B),
45+
(__v16sf)_mm512_setzero_ps());
46+
}
47+
1948
/* VMPSADBW */
2049
#define _mm512_mpsadbw_epu8(A, B, imm) \
2150
((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
@@ -31,5 +60,255 @@
3160
(__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
3261
(__v32hi)_mm512_setzero_si512()))
3362

63+
/* VNNI INT8 */
64+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W,
65+
__m512i __A,
66+
__m512i __B) {
67+
return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A,
68+
(__v16si)__B);
69+
}
70+
71+
static __inline__ __m512i __DEFAULT_FN_ATTRS
72+
_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
73+
return (__m512i)__builtin_ia32_selectd_512(
74+
__U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W);
75+
}
76+
77+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32(
78+
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
79+
return (__m512i)__builtin_ia32_selectd_512(
80+
__U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B),
81+
(__v16si)_mm512_setzero_si512());
82+
}
83+
84+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W,
85+
__m512i __A,
86+
__m512i __B) {
87+
return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A,
88+
(__v16si)__B);
89+
}
90+
91+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32(
92+
__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
93+
return (__m512i)__builtin_ia32_selectd_512(
94+
__U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W);
95+
}
96+
97+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32(
98+
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
99+
return (__m512i)__builtin_ia32_selectd_512(
100+
__U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B),
101+
(__v16si)_mm512_setzero_si512());
102+
}
103+
104+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W,
105+
__m512i __A,
106+
__m512i __B) {
107+
return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A,
108+
(__v16si)__B);
109+
}
110+
111+
static __inline__ __m512i __DEFAULT_FN_ATTRS
112+
_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
113+
return (__m512i)__builtin_ia32_selectd_512(
114+
__U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W);
115+
}
116+
117+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32(
118+
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
119+
return (__m512i)__builtin_ia32_selectd_512(
120+
__U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B),
121+
(__v16si)_mm512_setzero_si512());
122+
}
123+
124+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W,
125+
__m512i __A,
126+
__m512i __B) {
127+
return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A,
128+
(__v16si)__B);
129+
}
130+
131+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32(
132+
__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
133+
return (__m512i)__builtin_ia32_selectd_512(
134+
__U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W);
135+
}
136+
137+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32(
138+
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
139+
return (__m512i)__builtin_ia32_selectd_512(
140+
__U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B),
141+
(__v16si)_mm512_setzero_si512());
142+
}
143+
144+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W,
145+
__m512i __A,
146+
__m512i __B) {
147+
return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A,
148+
(__v16si)__B);
149+
}
150+
151+
static __inline__ __m512i __DEFAULT_FN_ATTRS
152+
_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
153+
return (__m512i)__builtin_ia32_selectd_512(
154+
__U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W);
155+
}
156+
157+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32(
158+
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
159+
return (__m512i)__builtin_ia32_selectd_512(
160+
__U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B),
161+
(__v16si)_mm512_setzero_si512());
162+
}
163+
164+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W,
165+
__m512i __A,
166+
__m512i __B) {
167+
return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A,
168+
(__v16si)__B);
169+
}
170+
171+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32(
172+
__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
173+
return (__m512i)__builtin_ia32_selectd_512(
174+
__U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W);
175+
}
176+
177+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32(
178+
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
179+
return (__m512i)__builtin_ia32_selectd_512(
180+
__U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B),
181+
(__v16si)_mm512_setzero_si512());
182+
}
183+
184+
/* VNNI INT16 */
185+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A,
186+
__m512i __B,
187+
__m512i __C) {
188+
return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
189+
(__v16si)__C);
190+
}
191+
192+
static __inline__ __m512i __DEFAULT_FN_ATTRS
193+
_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
194+
return (__m512i)__builtin_ia32_selectd_512(
195+
(__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
196+
(__v16si)__A);
197+
}
198+
199+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
200+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
201+
return (__m512i)__builtin_ia32_selectd_512(
202+
(__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
203+
(__v16si)_mm512_setzero_si512());
204+
}
205+
206+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A,
207+
__m512i __B,
208+
__m512i __C) {
209+
return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
210+
(__v16si)__C);
211+
}
212+
213+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
214+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
215+
return (__m512i)__builtin_ia32_selectd_512(
216+
(__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
217+
(__v16si)__A);
218+
}
219+
220+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
221+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
222+
return (__m512i)__builtin_ia32_selectd_512(
223+
(__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
224+
(__v16si)_mm512_setzero_si512());
225+
}
226+
227+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A,
228+
__m512i __B,
229+
__m512i __C) {
230+
return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
231+
(__v16si)__C);
232+
}
233+
234+
static __inline__ __m512i __DEFAULT_FN_ATTRS
235+
_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
236+
return (__m512i)__builtin_ia32_selectd_512(
237+
(__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
238+
(__v16si)__A);
239+
}
240+
241+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
242+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
243+
return (__m512i)__builtin_ia32_selectd_512(
244+
(__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
245+
(__v16si)_mm512_setzero_si512());
246+
}
247+
248+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A,
249+
__m512i __B,
250+
__m512i __C) {
251+
return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
252+
(__v16si)__C);
253+
}
254+
255+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
256+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
257+
return (__m512i)__builtin_ia32_selectd_512(
258+
(__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
259+
(__v16si)__A);
260+
}
261+
262+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
263+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
264+
return (__m512i)__builtin_ia32_selectd_512(
265+
(__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
266+
(__v16si)_mm512_setzero_si512());
267+
}
268+
269+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A,
270+
__m512i __B,
271+
__m512i __C) {
272+
return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
273+
(__v16si)__C);
274+
}
275+
276+
static __inline__ __m512i __DEFAULT_FN_ATTRS
277+
_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
278+
return (__m512i)__builtin_ia32_selectd_512(
279+
(__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
280+
(__v16si)__A);
281+
}
282+
283+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
284+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
285+
return (__m512i)__builtin_ia32_selectd_512(
286+
(__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
287+
(__v16si)_mm512_setzero_si512());
288+
}
289+
290+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A,
291+
__m512i __B,
292+
__m512i __C) {
293+
return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
294+
(__v16si)__C);
295+
}
296+
297+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
298+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
299+
return (__m512i)__builtin_ia32_selectd_512(
300+
(__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
301+
(__v16si)__A);
302+
}
303+
304+
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32(
305+
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
306+
return (__m512i)__builtin_ia32_selectd_512(
307+
(__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
308+
(__v16si)_mm512_setzero_si512());
309+
}
310+
311+
#undef __DEFAULT_FN_ATTRS
312+
34313
#endif /* __SSE2__ */
35314
#endif /* __AVX10_2_512NIINTRIN_H */

0 commit comments

Comments
 (0)