@@ -954,7 +954,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
         const float d  = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;

-        y[i].xxxd = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);

         int sum = 0;

@@ -969,7 +969,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
             sum += y[i].qs[QK8_1/2 + j];
         }

-        y[i].xxxs = GGML_FP32_TO_FP16(sum * d);
+        y[i].s = GGML_FP32_TO_FP16(sum * d);
     }
 }

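Note: the two hunks above cover the scalar reference path. For orientation, here is a minimal sketch of the block whose fields this commit renames. The exact layout is an assumption based on ggml-quants.h at this revision (QK8_1 = 32, ggml_fp16_t as the half type), not part of the diff:

    // Sketch of block_q8_1 (assumed layout, not from this diff):
    typedef struct {
        ggml_fp16_t d;         // delta (scale); named xxxd before this commit
        ggml_fp16_t s;         // d * sum(qs[j]); named xxxs before this commit
        int8_t      qs[QK8_1]; // signed 8-bit quants
    } block_q8_1;

Only the field names change, so the in-memory layout is untouched.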
@@ -997,7 +997,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
         const float d  = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;

-        y[i].xxxd = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);

         int32x4_t accv = vdupq_n_s32(0);

@@ -1013,7 +1013,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
             accv = vaddq_s32(accv, vi);
         }

-        y[i].xxxs = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
+        y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
     }
 #elif defined(__wasm_simd128__)
     for (int i = 0; i < nb; i++) {
@@ -1036,7 +1036,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
         const float d  = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;

-        y[i].xxxd = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);

         v128_t accv = wasm_i32x4_splat(0);

@@ -1052,7 +1052,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
             accv = wasm_i32x4_add(accv, vi);
         }

-        y[i].xxxs = GGML_FP32_TO_FP16(
+        y[i].s = GGML_FP32_TO_FP16(
             d * (wasm_i32x4_extract_lane(accv, 0) +
                  wasm_i32x4_extract_lane(accv, 1) +
                  wasm_i32x4_extract_lane(accv, 2) +
@@ -1081,7 +1081,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {

         // Quantize these floats
         const float d = maxScalar / 127.f;
-        y[i].xxxd = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);
         const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
         const __m256 mul = _mm256_set1_ps( id );

@@ -1105,7 +1105,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {

 #if defined(__AVX2__)
         // Compute the sum of the quants and set y[i].s
-        y[i].xxxs = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));

         // Convert int32 to int16
         i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
@@ -1135,7 +1135,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
         // Compute the sum of the quants and set y[i].s
         const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
         const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].xxxs = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
+        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));

         // Convert int32 to int16
         ni0 = _mm_packs_epi32( ni0, ni1 );
@@ -1166,7 +1166,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
         const float d  = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;

-        y[i].xxxd = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_FP32_TO_FP16(d);

         vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);

@@ -1183,7 +1183,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {

         // set y[i].s
         int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
-        y[i].xxxs = GGML_FP32_TO_FP16(sum * d);
+        y[i].s = GGML_FP32_TO_FP16(sum * d);
     }
 #else
     GGML_UNUSED(nb);
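Note: every SIMD branch touched above (NEON, WASM SIMD, AVX, RISC-V V) stores the same two values per block as the reference path. A scalar sketch of one block of quantize_row_q8_1, mirroring the reference hunks (assuming QK8_1 = 32; this is an illustration, not the committed code):

    float amax = 0.0f;                       // max |x| over the block
    for (int j = 0; j < QK8_1; j++) {
        amax = fmaxf(amax, fabsf(x[j]));
    }

    const float d  = amax / ((1 << 7) - 1);  // scale onto [-127, 127]
    const float id = d ? 1.0f/d : 0.0f;      // guard against an all-zero block

    int sum = 0;
    for (int j = 0; j < QK8_1; j++) {
        y->qs[j] = (int8_t) roundf(x[j]*id); // quantize
        sum     += y->qs[j];                 // accumulate for s
    }

    y->d = GGML_FP32_TO_FP16(d);
    y->s = GGML_FP32_TO_FP16(sum*d);         // cached for the dot kernels below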
@@ -4086,10 +4086,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const block_q8_1 * restrict b_y0 = &vy0[i];
         const block_q8_1 * restrict b_y1 = &vy1[i];

-        float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->xxxs),
-                               GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->xxxs),
-                               GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->xxxs),
-                               GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->xxxs)};
+        float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+                               GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+                               GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+                               GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
         summs0 += summs_t;

         const uint8x16_t m4b = vdupq_n_u8(0x0F);
@@ -4110,10 +4110,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

         // mmla into int32x4_t
-        float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->xxxd),
-                             GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->xxxd),
-                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->xxxd),
-                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->xxxd)};
+        float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                             GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};

         int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
         int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
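Note: the two hunks above sit in the NEON i8mm path ("mmla into int32x4_t"), which appears to process a 2x2 tile per iteration, two q4_1 rows (b_x0, b_x1) against two q8_1 blocks (b_y0, b_y1). That is why summs_t and scale each carry four cross terms rather than a single product; per tile entry the accumulation is

    tile[r][c] = d_x[r]*d_y[c] * sum_j qx[r][j]*qy[c][j] + m_x[r]*s_y[c]    for r, c in {0, 1}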
@@ -4154,7 +4154,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const block_q8_1 * restrict y0 = &y[i + 0];
         const block_q8_1 * restrict y1 = &y[i + 1];

-        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->xxxs) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->xxxs);
+        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);

         const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -4177,8 +4177,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
         const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->xxxd));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->xxxd));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
     }

     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@@ -4191,9 +4191,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     // Main loop
     for (int i = 0; i < nb; ++i) {
         const float d0 = GGML_FP16_TO_FP32(x[i].d);
-        const float d1 = GGML_FP16_TO_FP32(y[i].xxxd);
+        const float d1 = GGML_FP16_TO_FP32(y[i].d);

-        summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].xxxs);
+        summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);

         const __m256 d0v = _mm256_set1_ps( d0 );
         const __m256 d1v = _mm256_set1_ps( d1 );
@@ -4245,7 +4245,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].xxxd))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].xxxs);
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
     }

     *s = sumf;
@@ -4263,7 +4263,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
         }

-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].xxxd))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].xxxs);
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
     }

     *s = sumf;
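Note: the scalar fallback above shows why block_q8_1 carries s at all. A q4_1 value dequantizes as d_x*q + m_x, so one block of the dot product expands to

    sum_j (d_x*qx[j] + m_x) * (d_y*qy[j])
        = (d_x*d_y) * sum_j qx[j]*qy[j]  +  m_x * (d_y * sum_j qy[j])
        = (d_x*d_y) * sumi               +  m_x * s

The precomputed s = d_y * sum(qy) turns the offset term into one multiply per block instead of a second reduction over the quants, which is exactly the (d*d)*sumi + m*s shape every branch in this function uses.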
@@ -4599,8 +4599,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         const uint8x16_t m4b = vdupq_n_u8(0x0F);

-        summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->xxxs);
-        summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->xxxs);
+        summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
+        summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);

         // extract the 5th bit via lookup table ((b) << 4)
         memcpy(&qh0, x0->qh, sizeof(qh0));
@@ -4644,10 +4644,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                         ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->xxxd));
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                         ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->xxxd));
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
     }

     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -4664,7 +4664,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const block_q5_1 * restrict x0 = &x[i];
         const block_q8_1 * restrict y0 = &y[i];

-        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->xxxs);
+        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);

         const v128_t m4b = wasm_i8x16_splat(0x0F);

@@ -4711,7 +4711,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
                             wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                         wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                             wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->xxxd))));
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
     }

     *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -4726,14 +4726,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     for (int i = 0; i < nb; i++) {
         const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));

-        summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].xxxs);
+        summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);

         __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
         qx = _mm256_or_si256(qx, bxhi);

-        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].xxxd));
+        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
         const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

         const __m256 q = mul_sum_us8_pairs_float(qx, qy);
@@ -4753,7 +4753,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     for (int i = 0; i < nb; i++) {
         const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));

-        summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].xxxs);
+        summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);

         __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -4767,7 +4767,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         bxh = _mm_or_si128(bxh, bxhih);
         bx_0 = MM256_SET_M128I(bxh, bxl);

-        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].xxxd));
+        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
         const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);

         const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
@@ -4834,7 +4834,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].xxxd))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].xxxs);
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
     }

     *s = sumf;
@@ -4858,7 +4858,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
         }

-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].xxxd))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].xxxs);
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
     }

     *s = sumf;
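Note: ggml_vec_dot_q5_1_q8_1 relies on the same identity; q5_1 only differs in restoring the fifth bit from x[i].qh before the integer dot product, as the AVX hunks above do by masking bytes_from_bits_32(x[i].qh) with 0x10 and OR-ing it into the nibbles. A scalar sketch of that reconstruction, written from memory of the scalar branch (not shown in this diff, so treat the details as an assumption):

    // Assumed scalar sketch of the q5_1 5th-bit reconstruction:
    uint32_t qh;
    memcpy(&qh, x[i].qh, sizeof(qh));
    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
    const int32_t x0 = (x[i].qs[j] & 0x0F) | xh_0; // low nibble plus bit 4
    const int32_t x1 = (x[i].qs[j] >>   4) | xh_1; // high nibble plus bit 4

With x0/x1 widened to five bits, the accumulation is the familiar (d_x*d_y)*sumi + m_x*s, as in the final scalar hunk above.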