@@ -915,7 +915,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
915
915
const __m256i rhs_vec_0123_1 = _mm256_shuffle_epi8 (signextendlut , _mm256_and_si256 (rhs_raw_vec_0123_1 , m4b )); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
916
916
const __m256i rhs_vec_4567_1 = _mm256_shuffle_epi8 (signextendlut , _mm256_and_si256 (rhs_raw_vec_4567_1 , m4b )); // B4(8-15) B5(8-15) B6(8-15) B7(8-15)
917
917
918
- const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8 (signextendlut , _mm256_and_si256 (_mm256_srli_epi16 (rhs_raw_vec_0123_0 , 4 ), m4b )); // B0(16-23) B1(16-23) B2(16-23) B3(16-23
918
+ const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8 (signextendlut , _mm256_and_si256 (_mm256_srli_epi16 (rhs_raw_vec_0123_0 , 4 ), m4b )); // B0(16-23) B1(16-23) B2(16-23) B3(16-23)
919
919
const __m256i rhs_vec_4567_2 = _mm256_shuffle_epi8 (signextendlut , _mm256_and_si256 (_mm256_srli_epi16 (rhs_raw_vec_4567_0 , 4 ), m4b )); // B4(16-23) B5(16-23) B6(16-23) B7(16-23)
920
920
const __m256i rhs_vec_0123_3 = _mm256_shuffle_epi8 (signextendlut , _mm256_and_si256 (_mm256_srli_epi16 (rhs_raw_vec_0123_1 , 4 ), m4b )); // B0(24-31) B1(24-31) B2(24-31) B3(24-31)
921
921
const __m256i rhs_vec_4567_3 = _mm256_shuffle_epi8 (signextendlut , _mm256_and_si256 (_mm256_srli_epi16 (rhs_raw_vec_4567_1 , 4 ), m4b )); // B4(24-31) B5(24-31) B6(24-31) B7(24-31)
@@ -2437,6 +2437,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
2437
2437
2438
2438
// Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
2439
2439
int anr = nr - nr %16 ; // Used to align nr with boundary of 16
2440
+
2440
2441
for (; y < anr / 4 ; y += 4 ) {
2441
2442
const block_q8_0x4 * a_ptrs [4 ];
2442
2443
@@ -2625,7 +2626,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
2625
2626
}
2626
2627
2627
2628
for (int64_t b = 0 ; b < nb ; b ++ ) {
2628
- // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
2629
+ // Load the eight block_q8_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
2629
2630
const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256 ((const __m256i * )(b_ptr [b ].qs ));
2630
2631
const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256 ((const __m256i * )(b_ptr [b ].qs + 32 ));
2631
2632
const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256 ((const __m256i * )(b_ptr [b ].qs + 64 ));
0 commit comments