@@ -3326,7 +3326,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
         y[i].d = GGML_FP32_TO_FP16(d);
 
         // 5 elements per byte, along 32 bytes
-        for (size_t j = 0; j < sizeof(y->q) - sizeof(y->q) % 32; j += 32) {
+        for (size_t j = 0; j < sizeof(y->qs) - sizeof(y->qs) % 32; j += 32) {
             for (size_t m = 0; m < 32; ++m) {
                 uint8_t q = 0;
                 for (size_t n = 0; n < 5; ++n) {
@@ -3336,12 +3336,12 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
                 }
                 // ceiling division (243 == pow(3, 5))
                 q = ((uint16_t)q * 256 + (243 - 1)) / 243;
-                y[i].q[j + m] = q;
+                y[i].qs[j + m] = q;
             }
             x += 5*32;
         }
         // along 16 bytes
-        for (size_t j = sizeof(y->q) - sizeof(y->q) % 32; j < sizeof(y->q); j += 16) {
+        for (size_t j = sizeof(y->qs) - sizeof(y->qs) % 32; j < sizeof(y->qs); j += 16) {
             for (size_t m = 0; m < 16; ++m) {
                 uint8_t q = 0;
                 for (size_t n = 0; n < 5; ++n) {
@@ -3351,26 +3351,26 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
                 }
                 // ceiling division (243 == pow(3, 5))
                 q = ((uint16_t)q * 256 + (243 - 1)) / 243;
-                y[i].q[j + m] = q;
+                y[i].qs[j + m] = q;
             }
             x += 5*16;
         }
         // 4 elements per byte
-        for (size_t j = 0; j < sizeof(y->qs); ++j) {
+        for (size_t j = 0; j < sizeof(y->qh); ++j) {
             uint8_t q = 0;
             for (size_t m = 0; m < 4; ++m) {
                 // -1, 0, 1 -> 0, 1, 2
-                int xi = nearest_int(x[j + m*sizeof(y->qs)] * id) + 1;
+                int xi = nearest_int(x[j + m*sizeof(y->qh)] * id) + 1;
                 q *= 3;
                 q += xi;
             }
             // shift the first value to the most significant trit
             q *= 3;
             // ceiling division (243 == pow(3, 5))
             q = ((uint16_t)q * 256 + (243 - 1)) / 243;
-            y[i].qs[j] = q;
+            y[i].qh[j] = q;
         }
-        x += 4*sizeof(y->qs);
+        x += 4*sizeof(y->qh);
     }
 }
 
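
(A note on the encoding, not part of the diff.) Each byte of `y->qs` packs five ternary digits: they are accumulated in base 3 (at most 242) and then rescaled from [0, 243) to [0, 256) with the ceiling division, which is what later lets a trit be read back with one multiply and one shift. A minimal standalone sketch of just this packing (the helper name `pack_5_trits` is illustrative, not from ggml):

```c
#include <stdint.h>
#include <stdio.h>

// Pack five trits (each in {0, 1, 2}, t[0] most significant) into one byte,
// mirroring the inner loop of quantize_row_tq1_0_ref.
static uint8_t pack_5_trits(const uint8_t t[5]) {
    uint8_t q = 0;
    for (int n = 0; n < 5; ++n) {
        q = q*3 + t[n];                 // base-3 accumulate; max 242, fits a byte
    }
    // ceiling division (243 == pow(3, 5)) rescales [0, 243) onto [0, 256)
    return (uint8_t) (((uint16_t)q * 256 + (243 - 1)) / 243);
}

int main(void) {
    const uint8_t t[5] = {2, 0, 1, 2, 1};   // base-3 value 178
    printf("%u\n", pack_5_trits(t));        // prints 188 == ceil(178*256/243)
    return 0;
}
```

The `y->qh` bytes above hold only four data trits; the extra `q *= 3` pads them with a trailing zero trit, so the first value still lands in the most significant base-3 position and the same decoding rule applies to both arrays.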
@@ -3392,15 +3392,15 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
         y[i].d = GGML_FP32_TO_FP16(d);
 
         // TODO: should it be along 64 bytes instead for AVX512?
-        for (size_t j = 0; j < sizeof(y->q); j += 32) {
+        for (size_t j = 0; j < sizeof(y->qs); j += 32) {
             for (size_t m = 0; m < 32; ++m) {
                 uint8_t q = 0;
                 for (size_t n = 0; n < 4; ++n) {
                     // -1, 0, 1 -> 0, 1, 2
                     int xi = nearest_int(x[m + n*32] * id) + 1;
                     q += (xi & 3) << (2*n);
                 }
-                y[i].q[j + m] = q;
+                y[i].qs[j + m] = q;
             }
             x += 4*32;
         }
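
(Note, not part of the diff.) TQ2_0 is simpler: four ternary values are biased from {-1, 0, 1} to {0, 1, 2} and stored as plain 2-bit fields, value n in bits 2n..2n+1. A standalone sketch (helper name illustrative):

```c
#include <assert.h>
#include <stdint.h>

// Pack four ternary values, mirroring the inner loop of quantize_row_tq2_0_ref.
static uint8_t pack_4_ternary(const int8_t v[4]) {   // each v[n] in {-1, 0, 1}
    uint8_t q = 0;
    for (int n = 0; n < 4; ++n) {
        int xi = v[n] + 1;                  // -1, 0, 1 -> 0, 1, 2
        q += (uint8_t) ((xi & 3) << (2*n)); // value n lives in bits 2n..2n+1
    }
    return q;
}

int main(void) {
    const int8_t v[4] = {-1, 1, 0, 1};
    assert(pack_4_ternary(v) == 152);       // 0b10011000
    return 0;
}
```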
@@ -3444,28 +3444,28 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
 
-        for (size_t j = 0; j < sizeof(x->q) - sizeof(x->q) % 32; j += 32) {
+        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
             for (size_t n = 0; n < 5; ++n) {
                 for (size_t m = 0; m < 32; ++m) {
-                    uint8_t q = x[i].q[j + m] * pow3[n];
+                    uint8_t q = x[i].qs[j + m] * pow3[n];
                     int16_t xi = ((uint16_t) q * 3) >> 8;
                     *y++ = (float) (xi - 1) * d;
                 }
             }
         }
-        for (size_t j = sizeof(x->q) - sizeof(x->q) % 32; j < sizeof(x->q); j += 16) {
+        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
             for (size_t n = 0; n < 5; ++n) {
                 for (size_t m = 0; m < 16; ++m) {
-                    uint8_t q = x[i].q[j + m] * pow3[n];
+                    uint8_t q = x[i].qs[j + m] * pow3[n];
                     int16_t xi = ((uint16_t) q * 3) >> 8;
                     *y++ = (float) (xi - 1) * d;
                 }
             }
         }
 
         for (size_t n = 0; n < 4; ++n) {
-            for (size_t j = 0; j < sizeof(x->qs); ++j) {
-                uint8_t q = x[i].qs[j] * pow3[n];
+            for (size_t j = 0; j < sizeof(x->qh); ++j) {
+                uint8_t q = x[i].qh[j] * pow3[n];
                 int16_t xi = ((uint16_t) q * 3) >> 8;
                 *y++ = (float) (xi - 1) * d;
             }
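
(Note, not part of the diff.) The `(q * 3) >> 8` extraction works because the packed byte is the 5-trit value scaled by 256/243 and rounded up: multiplying by `pow3[n]` (which wraps mod 256) rotates trit n into the most significant base-3 position, and the widening multiply by 3 pushes that trit into bits 8..9. A standalone exhaustive check of the round trip:

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
    static const uint8_t pow3[5] = {1, 3, 9, 27, 81};
    for (uint16_t v = 0; v < 243; ++v) {                          // every 5-trit value
        uint8_t packed = (uint8_t) ((v * 256 + (243 - 1)) / 243); // as in quantize
        for (int n = 0; n < 5; ++n) {
            uint8_t q = (uint8_t) (packed * pow3[n]);  // wraps mod 256 on purpose
            uint16_t trit = ((uint16_t) q * 3) >> 8;   // as in dequantize
            assert(trit == (v / pow3[4 - n]) % 3);     // n-th trit, MSB first
        }
    }
    return 0;
}
```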
@@ -3481,10 +3481,10 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
 
-        for (size_t j = 0; j < sizeof(x->q); j += 32) {
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
             for (size_t l = 0; l < 4; ++l) {
                 for (size_t m = 0; m < 32; ++m) {
-                    int8_t q = (x[i].q[j + m] >> (l*2)) & 3;
+                    int8_t q = (x[i].qs[j + m] >> (l*2)) & 3;
                     *y++ = (float) (q - 1) * d;
                 }
             }
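
(Note, not part of the diff.) Decoding TQ2_0 is the inverse of the packing sketched earlier: shift the 2-bit field down, mask, and subtract the bias. For example, the byte 152 produced above round-trips back to {-1, 1, 0, 1}:

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
    const uint8_t q = 152;                            // 0b10011000
    const int8_t expected[4] = {-1, 1, 0, 1};
    for (int l = 0; l < 4; ++l) {
        int8_t v = (int8_t) (((q >> (l*2)) & 3) - 1); // as in dequantize_row_tq2_0
        assert(v == expected[l]);
    }
    return 0;
}
```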
@@ -5681,8 +5681,8 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
 
         // first 32 bytes of 5 elements
         {
-            uint8x16_t qx0 = vld1q_u8(x[i].q + 0);
-            uint8x16_t qx1 = vld1q_u8(x[i].q + 16);
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + 0);
+            uint8x16_t qx1 = vld1q_u8(x[i].qs + 16);
             uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3));
             uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3));
             uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9));
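
(Note.) The `vmulq_u8` multiplies by 3 and 9 here are the vectorized counterpart of the scalar `q * pow3[n]`: each multiply rotates the packed trits one more base-3 position toward the top of every byte, sixteen bytes at a time.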
@@ -5739,14 +5739,14 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
 
         // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
         {
-            uint8x16_t qx0 = vld1q_u8(x[i].q + 32);
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + 32);
             uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3));
             uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9));
             uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27));
             uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81));
-            uint32_t qs;
-            memcpy(&qs, x[i].qs, sizeof(qs)); // potentially unaligned
-            uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qs));
+            uint32_t qh;
+            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
+            uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh));
             qx5 = vmulq_u8(qx5, shift);
 
             // multiply by 3 and keep the 2 bits above 8 bits
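
(Note.) The comment above describes the vector form of the scalar `((uint16_t) q * 3) >> 8` extraction: a widening multiply by 3 whose bits 8..9 hold the trit. The broadcast `qh` word multiplied by `shift` appears to play the role of the scalar `pow3[l]` loop for the four trits in each `qh` byte.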
@@ -5802,7 +5802,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
 
         // first 32 bytes of 5 elements
         {
-            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].q));
+            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
             // 8-bit multiplies with shifts, masks and adds
             __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
             __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
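
(Note, not part of the diff.) AVX2 has no 8-bit multiply, hence the shift/mask/add sequences: `x * 9` becomes `((x << 3) & 0xF8) + x`, where the `_mm256_set1_epi8(-8)` mask (0xF8) clears the bits that `_mm256_slli_epi16` lets leak across byte boundaries inside each 16-bit lane. The per-byte identity can be checked exhaustively:

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
    for (int x = 0; x < 256; ++x) {
        uint8_t lhs = (uint8_t) (((x << 3) & 0xF8) + x); // shift, mask, add
        assert(lhs == (uint8_t) (x * 9));                // == 9*x mod 256
    }
    return 0;
}
```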
@@ -5848,10 +5848,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
 
         // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
        {
-            __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].q + 32));
-            uint32_t qs;
-            memcpy(&qs, x[i].qs, sizeof(qs)); // potentially unaligned
-            __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qs));
+            __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
+            uint32_t qh;
+            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
+            __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
             __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
             __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
             __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
@@ -5911,30 +5911,30 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void *
     for (int i = 0; i < nb; ++i) {
         int sum = 0;
 
-        for (size_t j = 0; j < sizeof(x->q) - sizeof(x->q) % 32; j += 32) {
+        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
             for (size_t l = 0; l < 5; ++l) {
                 for (size_t m = 0; m < 32; ++m) {
-                    uint8_t q = x[i].q[j + m] * pow3[l];
+                    uint8_t q = x[i].qs[j + m] * pow3[l];
                     uint16_t xi = ((uint16_t) q * 3) >> 8;
                     sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
                 }
             }
         }
-        for (size_t j = sizeof(x->q) - sizeof(x->q) % 32; j < sizeof(x->q); j += 16) {
+        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
             for (size_t l = 0; l < 5; ++l) {
                 for (size_t m = 0; m < 16; ++m) {
-                    uint8_t q = x[i].q[j + m] * pow3[l];
+                    uint8_t q = x[i].qs[j + m] * pow3[l];
                     uint16_t xi = ((uint16_t) q * 3) >> 8;
                     sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
                 }
             }
         }
 
         for (size_t l = 0; l < 4; ++l) {
-            for (size_t j = 0; j < sizeof(x->qs); ++j) {
-                uint8_t q = x[i].qs[j] * pow3[l];
+            for (size_t j = 0; j < sizeof(x->qh); ++j) {
+                uint8_t q = x[i].qh[j] * pow3[l];
                 uint16_t xi = ((uint16_t) q * 3) >> 8;
-                sum += (xi - 1) * y[i].qs[sizeof(x->q)*5 + l*sizeof(x->qs) + j];
+                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
             }
         }
 
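
(Note.) On the indexing in this scalar path: with QK_K == 256, `x->qs` is 48 bytes carrying 5 values each (240 activations, matching the "first 32 bytes" and "last 16 bytes" blocks above) and `x->qh` is 4 bytes carrying 4 values each (the remaining 16). The `qh` terms therefore start at offset `sizeof(x->qs)*5 == 240` into `y[i].qs`, which is exactly what the renamed expression spells out.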
@@ -5966,9 +5966,9 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
         int16x8_t sumi0 = vdupq_n_s16(0);
         int16x8_t sumi1 = vdupq_n_s16(0);
 
-        for (size_t j = 0; j < sizeof(x->q); j += 32) {
-            uint8x16_t qx0 = vld1q_u8(x[i].q + j);
-            uint8x16_t qx1 = vld1q_u8(x[i].q + j + 16);
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + j);
+            uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16);
             uint8x16_t qx2 = vshrq_n_u8(qx0, 2);
             uint8x16_t qx3 = vshrq_n_u8(qx1, 2);
             uint8x16_t qx4 = vshrq_n_u8(qx0, 4);
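
(Note.) The `vshrq_n_u8` shifts by 2 and 4 (with 6 presumably following in lines not shown) are the vector form of the scalar `q >> (l*2)`; the `& 3` masking happens further down, outside this hunk.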
@@ -6033,8 +6033,8 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
         __m256i sumi0 = _mm256_setzero_si256();
         __m256i sumi1 = _mm256_setzero_si256();
 
-        for (size_t j = 0; j < sizeof(x->q); j += 32) {
-            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].q + j));
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
             __m256i qx1 = _mm256_srli_epi16(qx0, 2);
             __m256i qx2 = _mm256_srli_epi16(qx0, 4);
             __m256i qx3 = _mm256_srli_epi16(qx0, 6);
@@ -6077,10 +6077,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void *
     for (int i = 0; i < nb; ++i) {
         int32_t sumi = 0;
 
-        for (size_t j = 0; j < sizeof(x->q); j += 32) {
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
             for (size_t l = 0; l < 4; ++l) {
                 for (size_t k = 0; k < 32; ++k) {
-                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].q[j + k] >> (l*2)) & 3) - 1);
+                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
                 }
             }
         }
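
(Note.) Taken together, this is a pure rename with no arithmetic changes: in `block_tq1_0`, `q` becomes `qs` (the main packed array) and the old `qs` becomes `qh` (the small secondary array); in `block_tq2_0`, `q` becomes `qs`. This matches the naming convention of existing block types such as `block_q5_0`, where `qs` holds the bulk of the quantized bits and `qh` the extra ones.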