@@ -1958,87 +1958,72 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     // Horizontal sum of all lanes of the accumulator
     sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
 #elif defined(__AVX2__)
-    // Input: 32 Nibbles (16 bytes) at *p0
-    // Output: 2 vectors with 16 values of type int16_t
-    #define EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(OUT_HIGH, OUT_LOW, IN_SRC) \
-        /* get first input */ \
-        /* Load 16 bytes from memory */ \
-        const __m128i tmp_##OUT_HIGH = \
-            _mm_loadu_si128( (const __m128i_u *) IN_SRC ); \
-        \
-        /* Expand bytes into uint16_t values */ \
-        const __m256i bytes_##OUT_HIGH = _mm256_cvtepu8_epi16( tmp_##OUT_HIGH ); \
-        \
-        /* Unpack values into individual bytes */ \
-        const __m256i pre_shift_##OUT_HIGH = \
-            _mm256_andnot_si256( lowMask, bytes_##OUT_HIGH ); \
-        __m256i OUT_HIGH = _mm256_srli_epi16( pre_shift_##OUT_HIGH, 4 ); \
-        \
-        __m256i OUT_LOW = _mm256_and_si256( lowMask, bytes_##OUT_HIGH ); \
-        /* Now we have a vector with bytes in [ 0 .. 15 ] interval.
-           Offset them into [ -8 .. +7 ] interval. */ \
-        OUT_HIGH = _mm256_sub_epi16( OUT_HIGH, offset_8 ); \
-        OUT_LOW = _mm256_sub_epi16( OUT_LOW, offset_8 );
-
-
-    // Input: 32 Nibbles (16 bytes) at *p0
-    // Output: 2 vectors with 16 values of type int16_t
-    #define GET_SCALE_AND_QUANT_DOT_PRODUCT(SCALE, DOT, INDEX, OFFSET, ACC) \
-        /* Compute combined scale for the block */ \
-        const __m256 SCALE = _mm256_mul_ps( \
-            _mm256_broadcast_ss( &x[INDEX+OFFSET].d ), \
-            _mm256_broadcast_ss( &y[INDEX+OFFSET].d ) ); \
-        \
-        /* Compute the dot product of the quads */ \
-        /* Input: 32 Nibbles (16 bytes) at *p0
-           Output: 2 vectors with 16 values of type int16_t */ \
-        EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS( \
-            x_high_##DOT, \
-            x_low_##DOT, \
-            x[INDEX+OFFSET].qs ) \
-        \
-        /* Input: 32 Nibbles (16 bytes) at *p1
-           Output: 2 vectors with 16 values of type int16_t */ \
-        EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS( \
-            y_high_##DOT, \
-            y_low_##DOT, \
-            y[INDEX+OFFSET].qs ) \
-        \
-        /* Compute products of int16_t integers, add pairwise */ \
-        __m256i x_y_high_##DOT = \
-            _mm256_madd_epi16( x_high_##DOT, y_high_##DOT ); \
-        \
-        __m256i x_y_low_##DOT = \
-            _mm256_madd_epi16( x_low_##DOT, y_low_##DOT ); \
-        \
-        /* Accumulate products of int16_t integers */ \
-        __m256i x_y_##DOT = _mm256_add_epi32( \
-            x_y_high_##DOT, \
-            x_y_low_##DOT ); \
-        \
-        /* Convert int32_t to float */ \
-        __m256 DOT = _mm256_cvtepi32_ps( x_y_##DOT ); \
-        ACC = _mm256_fmadd_ps( SCALE, DOT, ACC );
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
 
-    #define UNROLL_COUNT 8
+    /* Prepare the constants we will need during execution */
+    const __m256i lowMask = _mm256_set1_epi8( 0xF );
+    const __m256i offset_8 = _mm256_set1_epi16( 8 );
+
+    #define UNROLL_COUNT 8
     // make sure we only unroll multiples of the block count
     assert(nb % UNROLL_COUNT == 0);
 
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
     // Main loop
     for (int i = 0; i < nb; i += UNROLL_COUNT) {
 
-        /* Prepare the constants we will need during execution */
-        const __m256i lowMask = _mm256_set1_epi8( 0xF );
-        const __m256i offset_8 = _mm256_set1_epi16( 8 );
-
-        // This loop will be unrolled by the compiler
+        // This loop will be unrolled by the compiler
         for (int u = 0; u < UNROLL_COUNT; u++) {
-            GET_SCALE_AND_QUANT_DOT_PRODUCT( scale, q, i, u, acc );
+            /* Compute combined scale for the block */
+            const __m256 scale = _mm256_mul_ps(
+                _mm256_broadcast_ss( &x[i+u].d ),
+                _mm256_broadcast_ss( &y[i+u].d ) );
+
+            /* Get input from x
+               Input: 32 Nibbles (16 bytes) at *x[i+u]
+               Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
+
+            /* Load 16 bytes from memory */
+            const __m128i tmp_x = _mm_loadu_si128( (const __m128i_u *) x[i+u].qs );
+            /* Expand bytes into uint16_t values */
+            const __m256i bytes_x = _mm256_cvtepu8_epi16( tmp_x );
+            /* Unpack values into individual bytes */
+            __m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
+            const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
+            __m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
+            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+            x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
+            x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
+
+            /* Get input from y
+               Input: 32 Nibbles (16 bytes) at *y[i+u]
+               Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
+
+            /* Load 16 bytes from memory */
+            const __m128i tmp_y = _mm_loadu_si128( (const __m128i_u *) y[i+u].qs );
+            /* Expand bytes into uint16_t values */
+            const __m256i bytes_y = _mm256_cvtepu8_epi16( tmp_y );
+            /* Unpack values into individual bytes */
+            const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
+            __m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
+            __m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
+            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
+            y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
+            y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
+
+            /* Compute products of int16_t integers, add pairwise, store as int32_t */
+            __m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
+            __m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
+
+            /* Accumulate the products of int32_t integers -> we now have a vector of 8 int32_t */
+            __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
+
+            /* Convert the vector of 8 int32_t to 8 floats */
+            __m256 q = _mm256_cvtepi32_ps( xy_q );
+
+            /* Multiply q with scale and accumulate */
+            acc = _mm256_fmadd_ps( scale, q, acc );
         }
 
     }
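
For readers following the hunk above: per block, the unrolled AVX2 path computes the same value as the scalar sketch below. This is an illustrative reconstruction, not code from the PR; the names block_q4_0_sketch and vec_dot_q4_0_scalar are assumptions standing in for ggml's block_q4_0 layout (one float scale d plus 16 bytes qs packing 32 nibbles).

    #include <stddef.h>
    #include <stdint.h>

    #define QK 32  /* values per Q4_0 block */

    /* Hypothetical stand-in for ggml's block_q4_0: a float scale plus
       32 nibbles packed two-per-byte into 16 bytes. */
    typedef struct {
        float   d;
        uint8_t qs[QK / 2];
    } block_q4_0_sketch;

    /* Scalar equivalent of the AVX2 inner loop above: unpack the low and
       high nibbles of both operands, recenter [0..15] to [-8..+7],
       multiply, sum as integers, then apply the combined block scale. */
    static float vec_dot_q4_0_scalar(size_t nb,
                                     const block_q4_0_sketch * x,
                                     const block_q4_0_sketch * y) {
        float sumf = 0.0f;
        for (size_t i = 0; i < nb; i++) {
            int32_t sumi = 0;
            for (size_t j = 0; j < QK / 2; j++) {
                const int x_lo = (x[i].qs[j] & 0x0F) - 8; /* lowMask + offset_8  */
                const int x_hi = (x[i].qs[j] >> 4)   - 8; /* andnot + srli + sub */
                const int y_lo = (y[i].qs[j] & 0x0F) - 8;
                const int y_hi = (y[i].qs[j] >> 4)   - 8;
                /* _mm256_madd_epi16 multiplies int16 lanes and adds adjacent
                   pairs; summing every product here has the same effect. */
                sumi += x_lo * y_lo + x_hi * y_hi;
            }
            /* combined scale, as in _mm256_mul_ps of the two broadcast d's */
            sumf += x[i].d * y[i].d * (float) sumi;
        }
        return sumf;
    }

The vector code is just this computation eight blocks at a time: 16 int16 lanes per nibble half, _mm256_madd_epi16 for the products, and one fused multiply-add per block into the float accumulator.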
@@ -2070,7 +2055,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         bx = _mm_sub_epi8( bx, off );
         by = _mm_sub_epi8( by, off );
 
-    // Get absolute values of x vectors
+        // Get absolute values of x vectors
         const __m128i ax = _mm_sign_epi8(bx, bx);
 
         // Sign the values of the y vectors
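
The context lines of this second hunk belong to the 128-bit AVX path, which uses the classic _mm_sign_epi8 / _mm_maddubs_epi16 trick: _mm_maddubs_epi16 needs its first operand unsigned, so the code takes |x| and moves x's sign onto y, which preserves every product. Below is a scalar check of that identity; it is illustrative only, since the hunk shows just the ax = _mm_sign_epi8(bx, bx) half and the sy = _mm_sign_epi8(by, bx) step is assumed from the comment.

    #include <assert.h>
    #include <stdlib.h>

    /* Scalar model of _mm_sign_epi8(value, sign_source): negate where the
       sign source is negative, zero where it is zero, else pass through. */
    static int sign_epi8_model(int value, int sign_source) {
        if (sign_source < 0) return -value;
        if (sign_source == 0) return 0;
        return value;
    }

    int main(void) {
        /* Over the Q4_0 value range [-8..+7]: |x| * sign-by-x(y) == x * y,
           so the unsigned-by-signed maddubs still sums the right products. */
        for (int x = -8; x <= 7; x++) {
            for (int y = -8; y <= 7; y++) {
                const int ax = abs(x);                /* _mm_sign_epi8(bx, bx) */
                const int sy = sign_epi8_model(y, x); /* _mm_sign_epi8(by, bx) */
                assert(ax * sy == x * y);
            }
        }
        return 0;
    }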