@@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5654
5654
5655
5655
for (int i = 0 ; i < nb ; ++ i ) {
5656
5656
5657
- const float d = y [i ].d * ( float ) x [i ].d ;
5658
- const float dmin = - y [i ].d * ( float ) x [i ].dmin ;
5657
+ const float d = y [i ].d * GGML_FP16_TO_FP32 ( x [i ].d ) ;
5658
+ const float dmin = - y [i ].d * GGML_FP16_TO_FP32 ( x [i ].dmin ) ;
5659
5659
5660
5660
const uint8_t * restrict q2 = x [i ].qs ;
5661
5661
const int8_t * restrict q8 = y [i ].qs ;
@@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5804
5804
5805
5805
for (int i = 0 ; i < nb ; ++ i ) {
5806
5806
5807
- const float d = y [i ].d * ( float ) x [i ].d ;
5808
- const float dmin = - y [i ].d * ( float ) x [i ].dmin ;
5807
+ const float d = y [i ].d * GGML_FP16_TO_FP32 ( x [i ].d ) ;
5808
+ const float dmin = - y [i ].d * GGML_FP16_TO_FP32 ( x [i ].dmin ) ;
5809
5809
5810
5810
const uint8_t * restrict q2 = x [i ].qs ;
5811
5811
const int8_t * restrict q8 = y [i ].qs ;
@@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6458
6458
6459
6459
int32_t isum = -4 * (scales [0 ] * y [i ].bsums [0 ] + scales [2 ] * y [i ].bsums [1 ] + scales [1 ] * y [i ].bsums [2 ] + scales [3 ] * y [i ].bsums [3 ]);
6460
6460
6461
- const float d = y [i ].d * ( float ) x [i ].d ;
6461
+ const float d = y [i ].d * GGML_FP16_TO_FP32 ( x [i ].d ) ;
6462
6462
6463
6463
const uint8x16_t htmp = vcombine_u8 (hbits , vshr_n_u8 (hbits , 1 ));
6464
6464
q3h .val [0 ] = vandq_u8 (mh , vshlq_n_u8 (htmp , 2 ));
@@ -6660,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6660
6660
6661
6661
int32_t isum = -4 * (scales [0 ] * y [i ].bsums [0 ] + scales [2 ] * y [i ].bsums [1 ] + scales [1 ] * y [i ].bsums [2 ] + scales [3 ] * y [i ].bsums [3 ]);
6662
6662
6663
- const float d = y [i ].d * ( float ) x [i ].d ;
6663
+ const float d = y [i ].d * GGML_FP16_TO_FP32 ( x [i ].d ) ;
6664
6664
6665
6665
vint32m1_t vzero = __riscv_vmv_v_x_i32m1 (0 , 1 );
6666
6666
@@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7163
7163
aux16 [1 ] = (a [0 ] >> 4 ) & 0x0f0f ;
7164
7164
7165
7165
const int32_t summi = scales [2 ] * (y [i ].bsums [0 ] + y [i ].bsums [1 ]) + scales [3 ] * (y [i ].bsums [2 ] + y [i ].bsums [3 ]);
7166
- sum_mins += y [i ].d * ( float ) x [i ].d [1 ] * summi ;
7166
+ sum_mins += y [i ].d * GGML_FP16_TO_FP32 ( x [i ].d [1 ]) * summi ;
7167
7167
7168
- const float d = y [i ].d * ( float ) x [i ].d [0 ];
7168
+ const float d = y [i ].d * GGML_FP16_TO_FP32 ( x [i ].d [0 ]) ;
7169
7169
7170
7170
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2 (q4 );
7171
7171
@@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7823
7823
7824
7824
for (int i = 0 ; i < nb ; ++ i ) {
7825
7825
7826
- const float d = y [i ].d * ( float ) x [i ].d ;
7826
+ const float d = y [i ].d * GGML_FP16_TO_FP32 ( x [i ].d ) ;
7827
7827
const int8_t * sc = x [i ].scales ;
7828
7828
7829
7829
const uint8_t * restrict q5 = x [i ].qs ;
@@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7965
7965
7966
7966
for (int i = 0 ; i < nb ; ++ i ) {
7967
7967
7968
- const float d = y [i ].d * ( float ) x [i ].d ;
7968
+ const float d = y [i ].d * GGML_FP16_TO_FP32 ( x [i ].d ) ;
7969
7969
const int8_t * sc = x [i ].scales ;
7970
7970
7971
7971
const uint8_t * restrict q5 = x [i ].qs ;
@@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8533
8533
8534
8534
for (int i = 0 ; i < nb ; ++ i ) {
8535
8535
8536
- const float d_all = ( float ) x [i ].d ;
8536
+ const float d_all = GGML_FP16_TO_FP32 ( x [i ].d ) ;
8537
8537
8538
8538
const uint8_t * restrict q6 = x [i ].ql ;
8539
8539
const uint8_t * restrict qh = x [i ].qh ;
@@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8704
8704
8705
8705
for (int i = 0 ; i < nb ; ++ i ) {
8706
8706
8707
- const float d_all = ( float ) x [i ].d ;
8707
+ const float d_all = GGML_FP16_TO_FP32 ( x [i ].d ) ;
8708
8708
8709
8709
const uint8_t * restrict q6 = x [i ].ql ;
8710
8710
const uint8_t * restrict qh = x [i ].qh ;
@@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
9523
9523
float sumf = 0 ;
9524
9524
9525
9525
for (int ib = 0 ; ib < nb ; ib += 2 ) {
9526
-
9527
9526
q4bits .val [0 ] = vld1q_u8 (x [ib + 0 ].qs );
9528
9527
q4bits .val [1 ] = vld1q_u8 (x [ib + 1 ].qs );
9529
9528
q8b .val [0 ] = vld1q_s8 (y [ib + 0 ].qs );
@@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
9539
9538
prod_1 = ggml_vdotq_s32 (ggml_vdotq_s32 (vdupq_n_s32 (0 ), q4b .val [0 ], q8b .val [0 ]), q4b .val [1 ], q8b .val [1 ]);
9540
9539
prod_2 = ggml_vdotq_s32 (ggml_vdotq_s32 (vdupq_n_s32 (0 ), q4b .val [2 ], q8b .val [2 ]), q4b .val [3 ], q8b .val [3 ]);
9541
9540
9542
- sumf += (float )x [ib + 0 ].d * (float )y [ib + 0 ].d * vaddvq_s32 (prod_1 ) + (float )x [ib + 1 ].d * (float )y [ib + 1 ].d * vaddvq_s32 (prod_2 );
9543
-
9541
+ sumf +=
9542
+ GGML_FP16_TO_FP32 (x [ib + 0 ].d ) * GGML_FP16_TO_FP32 (y [ib + 0 ].d ) * vaddvq_s32 (prod_1 ) +
9543
+ GGML_FP16_TO_FP32 (x [ib + 1 ].d ) * GGML_FP16_TO_FP32 (y [ib + 1 ].d ) * vaddvq_s32 (prod_2 );
9544
9544
}
9545
9545
9546
9546
* s = sumf ;
0 commit comments