@@ -2188,6 +2188,162 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     *s = sumf;
 }
 
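+// Tiled variant of ggml_vec_dot_q4_0: computes the dot products of one
+// quantized row at vx against tilesize_y quantized rows at vy in a single
+// pass, so the expanded nibbles of x can be reused for every row of the
+// tile. The tilesize_y results are written to s with a stride of
+// dst_stridelength floats; rowlength is the distance between consecutive
+// rows of vy, in blocks. Note that the tilesize_x/tilesize_y arguments are
+// currently unused in favor of the compile-time SEAP_TILESIZE_X/Y constants.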
+static void seap_ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy, const int tilesize_x, const int tilesize_y, const int rowlength, const int dst_stridelength) {
+    const int nb = n / QK;
+
+    assert(n % QK == 0);
+    assert(nb % 2 == 0);
+
+    const block_q4_0 * restrict x = vx;
+    const block_q4_0 * restrict y = vy;
+
+    float sumf = 0.0;
+
+//#if defined(__AVX2__)
+#if 1
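+// NOTE: "#if 1" forces the vectorized path even on targets without AVX2;
+// the commented-out __AVX2__ guard above looks like the intended condition.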
+
+#define SEAP_TILESIZE_X 1
+#define SEAP_TILESIZE_Y 8
+#define UNROLL_COUNT 8/SEAP_TILESIZE_Y
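+// SEAP_TILESIZE_Y and UNROLL_COUNT are coupled so that one outer-loop
+// iteration always covers 8 blocks (UNROLL_COUNT * SEAP_TILESIZE_Y == 8);
+// with a tile height of 8 the unroll factor collapses to 1.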
+#undef SEAP_DEBUG
+
+    // Initialize the accumulators with zeros
+    __m256 acc[SEAP_TILESIZE_Y];
+    for (int i = 0; i < SEAP_TILESIZE_Y; i++) {
+        acc[i] = _mm256_setzero_ps();
+    }
+
+    /* Prepare the constants we will need during execution */
+    const __m256i lowMask  = _mm256_set1_epi8(0xF);
+    const __m256i offset_8 = _mm256_set1_epi16(8);
+
+    // make sure we only unroll multiples of the block count
+    assert(nb % UNROLL_COUNT == 0);
+
+    // Input:  32 nibbles (16 bytes) at *IN_SRC
+    // Output: 2 vectors with 16 values of type int16_t (OUT_HIGH[INDEX], OUT_LOW[INDEX])
+#define EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(OUT_HIGH, OUT_LOW, IN_SRC, INDEX)  \
+    /* Load 16 bytes from memory */                                                   \
+    const __m128i tmp_##OUT_HIGH =                                                    \
+        _mm_loadu_si128((const __m128i_u *) IN_SRC);                                  \
+                                                                                      \
+    /* Zero-extend each byte into a uint16_t lane */                                  \
+    const __m256i bytes_##OUT_HIGH = _mm256_cvtepu8_epi16(tmp_##OUT_HIGH);            \
+                                                                                      \
+    /* Isolate the high nibble of each byte and shift it down */                      \
+    const __m256i pre_shift_##OUT_HIGH =                                              \
+        _mm256_andnot_si256(lowMask, bytes_##OUT_HIGH);                               \
+    OUT_HIGH[INDEX] = _mm256_srli_epi16(pre_shift_##OUT_HIGH, 4);                     \
+                                                                                      \
+    /* Isolate the low nibble of each byte */                                         \
+    OUT_LOW[INDEX] = _mm256_and_si256(lowMask, bytes_##OUT_HIGH);                     \
+    /* Now we have vectors with values in the [0 .. 15] interval.                     \
+       Offset them into the [-8 .. +7] interval. */                                   \
+    OUT_HIGH[INDEX] = _mm256_sub_epi16(OUT_HIGH[INDEX], offset_8);                    \
+    OUT_LOW[INDEX]  = _mm256_sub_epi16(OUT_LOW[INDEX], offset_8);
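+// Note: the macro declares const temporaries, so the ##OUT_HIGH token pasting
+// keeps their names unique across expansions (x_high_q vs. y_high_q below).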
+
+    // Main loop
+    for (int i = 0; i < nb; i += UNROLL_COUNT) {
+
+        // This loop will be unrolled by the compiler
+        for (int u = 0; u < UNROLL_COUNT; u++) {
+
+            /* get input from x
+               Input:  32 nibbles (16 bytes) at *x[i+u].qs
+               Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
+            __m256i x_high_q[SEAP_TILESIZE_X];
+            __m256i x_low_q[SEAP_TILESIZE_X];
+            EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(x_high_q, x_low_q, x[i+u].qs, 0)
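+
+            // x is expanded only once here and reused for all SEAP_TILESIZE_Y
+            // rows of y in the loop below; this reuse is the point of the tiling.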
+            __m256 scale[SEAP_TILESIZE_Y];
+
+            for (int t = 0; t < SEAP_TILESIZE_Y; t++) {
+                /* Compute the combined scale for the block */
+                scale[t] = _mm256_mul_ps(
+                        _mm256_broadcast_ss(&x[i+u].d),
+                        _mm256_broadcast_ss(&y[i+u+t*rowlength].d));
+
+#ifdef SEAP_DEBUG
+                void *p = &y[i+u+t*rowlength];
+                printf("dot_vec i=%i,u=%i,t=%i = %li = %f\n", i, u, t, (long int) p, ((block_q4_0 *)(p))->d);
+#endif
+
+                /* get input from y
+                   Input:  32 nibbles (16 bytes) at *y[i+u+t*rowlength].qs
+                   Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
+                __m256i y_high_q[SEAP_TILESIZE_Y];
+                __m256i y_low_q[SEAP_TILESIZE_Y];
+
+                EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(y_high_q, y_low_q, y[i+u+t*rowlength].qs, t)
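+
+                /* _mm256_madd_epi16 multiplies corresponding int16_t lanes and
+                   adds adjacent product pairs into int32_t lanes; with inputs
+                   in [-8 .. +7] these sums cannot overflow. */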
+                /* Compute products of int16_t integers, add pairwise, store as int32_t */
+                __m256i xy_high_q[SEAP_TILESIZE_Y];
+                xy_high_q[t] = _mm256_madd_epi16(x_high_q[0], y_high_q[t]);
+                __m256i xy_low_q[SEAP_TILESIZE_Y];
+                xy_low_q[t] = _mm256_madd_epi16(x_low_q[0], y_low_q[t]);
+
+                /* Accumulate the products of int32_t integers -> we now have a vector of 8 int32_t */
+                __m256i xy_q[SEAP_TILESIZE_Y];
+                xy_q[t] = _mm256_add_epi32(xy_high_q[t], xy_low_q[t]);
+
+                /* Convert the vector of 8 int32_t to 8 floats */
+                __m256 q[SEAP_TILESIZE_Y];
+                q[t] = _mm256_cvtepi32_ps(xy_q[t]);
+
+                /* Multiply q with the scale and accumulate */
+                acc[t] = _mm256_fmadd_ps(scale[t], q[t], acc[t]);
+            }
+        }
+    }
+
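+    /* Reduce each 8-float accumulator to a scalar: add the upper 128-bit half
+       onto the lower half, then fold the remaining four floats in two steps. */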
+    for (int t = 0; t < SEAP_TILESIZE_Y; t++) {
+        // Compute the horizontal sum of the acc vector
+        __m128 res = _mm256_extractf128_ps(acc[t], 1);
+        res = _mm_add_ps(res, _mm256_castps256_ps128(acc[t]));
+        res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+        res = _mm_add_ss(res, _mm_movehdup_ps(res));
+
+        s[t*dst_stridelength] = _mm_cvtss_f32(res);
+
+#ifdef SEAP_DEBUG
+        void *p = &s[t*dst_stridelength];
+        printf("dot_vec dst[%i] @ %li = %f\n", t, (long int) p, *(float *)(p));
+#endif
+    }
+
+#else
+    // scalar
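+    // NOTE: this scalar fallback computes only a single (untiled) dot product;
+    // it ignores tilesize_y and dst_stridelength, unlike the path above.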
+    for (int i = 0; i < nb; i++) {
+        const float d0 = x[i].d;
+        const float d1 = y[i].d;
+
+        const uint8_t * restrict p0 = x[i].qs;
+        const uint8_t * restrict p1 = y[i].qs;
+
+        for (int j = 0; j < QK/2; j++) {
+            const uint8_t v0 = p0[j];
+            const uint8_t v1 = p1[j];
+
+            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
+            const float f1 = d0*((int8_t) (v0 >> 4) - 8);
+
+            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
+            const float f3 = d1*((int8_t) (v1 >> 4) - 8);
+
+            sumf += f0*f2 + f1*f3;
+        }
+    }
+    *s = sumf;
+#endif
+}
+
 
 static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK;
@@ -6718,9 +6874,43 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
             assert(ne00 % 32 == 0);
 
-            for (int64_t ic = 0; ic < ne11; ++ic) {
-                vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+            if (ne11 < SEAP_TILESIZE_Y) {
+                // existing (non-tiled) implementation
+                for (int64_t ic = 0; ic < ne11; ++ic) {
+                    vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+                }
+            } else {
+                // tiled implementation
+                if ((ne11 % SEAP_TILESIZE_Y) != 0) {
+                    printf("ne11=%lld\n", (long long) ne11);
+                }
+                assert((ne11 % SEAP_TILESIZE_Y) == 0); // make sure we have a multiple of the tile size
+
+                for (int64_t ic = 0; ic < ne11; ic += SEAP_TILESIZE_Y) {
+                    //vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+
+#ifdef SEAP_DEBUG
+                    for (int t = 0; t < SEAP_TILESIZE_Y; t++) {
+                        void *p = (void *) (src1_col + (ic + t)*row_size);
+                        printf("%i = %li = %f\n", t, (long int) p, ((block_q4_0 *)(p))->d);
+                    }
+#endif
+
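+                    // row_size is in bytes; dividing by GGML_TYPE_SIZE[type]
+                    // (the size of one quantized block) gives the row stride in
+                    // block_q4_0 units, which is what the rowlength parameter expects.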
+                    seap_ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size), SEAP_TILESIZE_X, SEAP_TILESIZE_Y, row_size/GGML_TYPE_SIZE[type], ne0);
+
+#ifdef SEAP_DEBUG
+                    for (int t = 0; t < SEAP_TILESIZE_Y; t++) {
+                        void *p = &dst_col[(ic + t)*ne0];
+                        printf("dst[%i] @ %li = %f\n", (int) ((ic + t)*ne0), (long int) p, *(float *)(p));
+                    }
+
+                    if (ic >= 3) exit(0);
+#endif
+                }
+
             }
+
         }
 
         //int64_t t1 = ggml_time_us();