Skip to content

Commit dc1c5ae

Browse files
committed
Experimental code that achives 30k FLOPS
1 parent 437e778 commit dc1c5ae

File tree

1 file changed

+192
-2
lines changed

1 file changed

+192
-2
lines changed

ggml.c

Lines changed: 192 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2188,6 +2188,162 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
21882188
*s = sumf;
21892189
}
21902190

2191+
static void seap_ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy, const int tilesize_x, const int tilesize_y, const int rowlength, const int dst_stridelength) {
2192+
const int nb = n / QK;
2193+
2194+
assert(n % QK == 0);
2195+
assert(nb % 2 == 0);
2196+
2197+
const block_q4_0 * restrict x = vx;
2198+
const block_q4_0 * restrict y = vy;
2199+
2200+
float sumf = 0.0;
2201+
2202+
2203+
//#if defined(__AVX2__)
2204+
#if 1
2205+
2206+
#define SEAP_TILESIZE_X 1
2207+
#define SEAP_TILESIZE_Y 8
2208+
#define UNROLL_COUNT 8/SEAP_TILESIZE_Y
2209+
#undef SEAP_DEBUG
2210+
2211+
// Initialize accumulator with zeros
2212+
__m256 acc[SEAP_TILESIZE_Y]; // = 0; // _mm256_setzero_ps();
2213+
for (int i=0;i<SEAP_TILESIZE_Y; i++) {
2214+
acc[i] = _mm256_setzero_ps();
2215+
}
2216+
2217+
2218+
/* Prepare the constants we will need during execution */
2219+
const __m256i lowMask = _mm256_set1_epi8( 0xF );
2220+
const __m256i offset_8 = _mm256_set1_epi16( 8 );
2221+
2222+
// make sure we only unroll multiples of the block count
2223+
assert(nb % UNROLL_COUNT == 0);
2224+
2225+
// Input: 32 Nibbles (16 bytes) at *p0
2226+
// Output: 2 vectors with 16 values of type int16_t
2227+
#define EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(OUT_HIGH,OUT_LOW,IN_SRC,INDEX) \
2228+
/* get first input */ \
2229+
/* Load 16 bytes from memory */ \
2230+
const __m128i tmp_##OUT_HIGH = \
2231+
_mm_loadu_si128( (const __m128i_u *) IN_SRC); \
2232+
\
2233+
/* Expand bytes into uint16_t values */ \
2234+
const __m256i bytes_##OUT_HIGH = _mm256_cvtepu8_epi16(tmp_##OUT_HIGH); \
2235+
\
2236+
/* Unpack values into individual bytes */ \
2237+
const __m256i pre_shift_##OUT_HIGH = \
2238+
_mm256_andnot_si256( lowMask, bytes_##OUT_HIGH ); \
2239+
OUT_HIGH[INDEX] = _mm256_srli_epi16( pre_shift_##OUT_HIGH, 4 ); \
2240+
\
2241+
OUT_LOW[INDEX] = _mm256_and_si256( lowMask, bytes_##OUT_HIGH ); \
2242+
/* Now we have a vector with bytes in [ 0 .. 15 ] interval.
2243+
Offset them into [ -8 .. +7 ] interval. */ \
2244+
OUT_HIGH[INDEX] = _mm256_sub_epi16( OUT_HIGH[INDEX], offset_8 ); \
2245+
OUT_LOW[INDEX] = _mm256_sub_epi16( OUT_LOW[INDEX], offset_8 );
2246+
2247+
2248+
2249+
// Main loop
2250+
for (int i = 0; i < nb; i+=UNROLL_COUNT) {
2251+
2252+
// This loop will be unrolled by the compiler
2253+
for (int u=0;u<UNROLL_COUNT;u++) {
2254+
2255+
/* get input from x
2256+
Input: 32 Nibbles (16 bytes) at *x[i+u]
2257+
Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
2258+
__m256i x_high_q[SEAP_TILESIZE_X];
2259+
__m256i x_low_q[SEAP_TILESIZE_X];
2260+
EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(x_high_q, x_low_q, x[i+u].qs,0)
2261+
2262+
__m256 scale[SEAP_TILESIZE_Y];
2263+
2264+
for (int t=0;t<SEAP_TILESIZE_Y;t++) {
2265+
/* Compute combined scale for the block */
2266+
scale[t] = _mm256_mul_ps(
2267+
_mm256_broadcast_ss( &x[i+u].d ),
2268+
_mm256_broadcast_ss( &y[i+u+t*rowlength].d ) );
2269+
2270+
#ifdef SEAP_DEBUG
2271+
void *p = &y[i+u+t*rowlength];
2272+
printf("dot_ved i=%i,u=%i,t=%i = %li = %f \n",i,u,t, (long int)p, ((block_q4_0 *)(p))->d);
2273+
#endif
2274+
2275+
/* get input from y
2276+
Input: 32 Nibbles (16 bytes) at *y[i+u]
2277+
Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
2278+
__m256i y_high_q[SEAP_TILESIZE_Y];
2279+
__m256i y_low_q[SEAP_TILESIZE_Y];
2280+
2281+
EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(y_high_q, y_low_q, y[i+u+t*rowlength].qs,t)
2282+
2283+
/* Compute products of int16_t integers, add pairwise, store as int32_t */
2284+
__m256i xy_high_q[SEAP_TILESIZE_Y];
2285+
xy_high_q[t] = _mm256_madd_epi16( x_high_q[0], y_high_q[t] );
2286+
__m256i xy_low_q[SEAP_TILESIZE_Y];
2287+
xy_low_q[t]= _mm256_madd_epi16( x_low_q[0], y_low_q[t] );
2288+
2289+
/* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */
2290+
__m256i xy_q[SEAP_TILESIZE_Y];
2291+
xy_q[t] = _mm256_add_epi32( xy_high_q[t], xy_low_q[t] );
2292+
2293+
/* Convert to vectore of 8 int32_t to 8 floats */
2294+
__m256 q[SEAP_TILESIZE_Y];
2295+
q[t] = _mm256_cvtepi32_ps( xy_q[t] );
2296+
2297+
/* Multiply q with scale and accumulate */
2298+
acc[t] = _mm256_fmadd_ps( scale[t], q[t], acc[t] );
2299+
2300+
}
2301+
2302+
}
2303+
2304+
}
2305+
2306+
for (int t=0;t<SEAP_TILESIZE_Y;t++) {
2307+
// Return horizontal sum of the acc vector
2308+
__m128 res = _mm256_extractf128_ps( acc[t], 1 );
2309+
res = _mm_add_ps( res, _mm256_castps256_ps128( acc[t] ) );
2310+
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
2311+
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
2312+
2313+
s[(t*dst_stridelength)] = _mm_cvtss_f32( res );
2314+
2315+
#ifdef SEAP_DEBUG
2316+
void *p = &s[(t*dst_stridelength)];
2317+
printf("dot_vec dst[%i] @ %li = %f \n",t, (long int)p, (float *)(p));
2318+
#endif
2319+
}
2320+
2321+
#else
2322+
// scalar
2323+
for (int i = 0; i < nb; i++) {
2324+
const float d0 = x[i].d;
2325+
const float d1 = y[i].d;
2326+
2327+
const uint8_t * restrict p0 = x[i].qs;
2328+
const uint8_t * restrict p1 = y[i].qs;
2329+
2330+
for (int j = 0; j < QK/2; j++) {
2331+
const uint8_t v0 = p0[j];
2332+
const uint8_t v1 = p1[j];
2333+
2334+
const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
2335+
const float f1 = d0*((int8_t) (v0 >> 4) - 8);
2336+
2337+
const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
2338+
const float f3 = d1*((int8_t) (v1 >> 4) - 8);
2339+
2340+
sumf += f0*f2 + f1*f3;
2341+
}
2342+
}
2343+
*s = sumf;
2344+
#endif
2345+
}
2346+
21912347
static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
21922348
const int nb = n / QK;
21932349

@@ -6718,9 +6874,43 @@ static void ggml_compute_forward_mul_mat_q_f32(
67186874

67196875
assert(ne00 % 32 == 0);
67206876

6721-
for (int64_t ic = 0; ic < ne11; ++ic) {
6722-
vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
6877+
if (ne11 < SEAP_TILESIZE_Y) {
6878+
// existing implementation tiled implementation
6879+
for (int64_t ic = 0; ic < ne11; ++ic) {
6880+
vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
6881+
}
6882+
} else {
6883+
// tiled implementation
6884+
if ((ne11 % SEAP_TILESIZE_Y) != 0) {
6885+
printf("ne11=%i\n",ne11);
6886+
}
6887+
assert((ne11 % SEAP_TILESIZE_Y) == 0); // make sure we have a multiple of the tilesize
6888+
6889+
for (int64_t ic = 0; ic < ne11; ic+=SEAP_TILESIZE_Y) {
6890+
//vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
6891+
6892+
#ifdef SEAP_DEBUG
6893+
for (int t=0;t<SEAP_TILESIZE_Y;t++) {
6894+
void *p = (src1_col + (ic+t)*row_size);
6895+
printf("%i = %li = %f \n",t, (long int)p, ((block_q4_0 *)(p))->d);
6896+
}
6897+
#endif
6898+
6899+
seap_ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size), SEAP_TILESIZE_X, SEAP_TILESIZE_Y, row_size/GGML_TYPE_SIZE[type], ne0);
6900+
6901+
#ifdef SEAP_DEBUG
6902+
for (int t=0;t<SEAP_TILESIZE_Y;t++) {
6903+
void *p = &dst_col[(ic+t)*ne0];
6904+
printf("dst[%i] @ %li = %f \n",(ic+t)*ne0, (long int)p, (float *)(p));
6905+
}
6906+
6907+
if (ic>=3) exit(0);
6908+
#endif
6909+
}
6910+
6911+
67236912
}
6913+
67246914
}
67256915

67266916
//int64_t t1 = ggml_time_us();

0 commit comments

Comments
 (0)