ggml : support AVX512VNNI (#6280)

jart · web-flow · commit 7733f0c76081 · 2024-03-25T07:39:56.000+02:00
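This change makes some quantization formats (e.g. Q4_0, Q8_0) run faster on
architectures that support AVX512-VNNI (e.g. AMD Zen 4): the
`_mm256_dpbusd_epi32` fast path in `mul_sum_us8_pairs_float` is now compiled
when either `__AVXVNNI__` or `__AVX512VNNI__` is defined.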
diff --git a/ggml-quants.c b/ggml-quants.c
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if __AVXVNNI__
+#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);

Original file line number	Diff line number	Diff line change
`@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {`
`132`	`132`	`}`
`133`	`133`
`134`	`134`	`static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {`
`135`		`-#if __AVXVNNI__`
	`135`	`+#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)`
`136`	`136`	`const __m256i zero = _mm256_setzero_si256();`
`137`	`137`	`const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);`
`138`	`138`	`return _mm256_cvtepi32_ps(summed_pairs);`