@@ -3424,48 +3424,6 @@ void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int6
    const int64_t nb = k / QK1_3;
    static_assert(sizeof(x->q) % 4 == 0, "bad block_q1_3.q size");

-    // #if defined(__SSE2__)
-    // __m128 vscale = _mm_set1_ps(scale);
-
-    // for (int64_t i = 0; i < nb; ++i) {
-    //     for (size_t j = 0; j < sizeof(x->q); j += 4) {
-    //         __m128 q1 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 0]]));
-    //         __m128 q2 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 1]]));
-    //         __m128 q3 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 2]]));
-    //         __m128 q4 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 3]]));
-    //         q1 = _mm_mul_ps(q1, vscale);
-    //         q2 = _mm_mul_ps(q2, vscale);
-    //         q3 = _mm_mul_ps(q3, vscale);
-    //         q4 = _mm_mul_ps(q4, vscale);
-
-    //         _mm_store_ps(y + 0, q1);
-    //         _mm_store_ps(y + 4, q2);
-    //         _mm_store_ps(y + 8, q3);
-    //         _mm_store_ps(y + 12, q4);
-    //         y += 16;
-    //     }
-
-    //     for (size_t j = 0; j < sizeof(x->q); j += 4) {
-    //         __m128i q5i = _mm_loadu_si32(x[i].q + j);
-    //         q5i = _mm_cvtepi8_epi16(q5i);
-    //         q5i = _mm_add_epi16(q5i, _mm_add_epi16(q5i, q5i));
-    //         q5i = _mm_srli_epi16(q5i, 8);
-    //         q5i = _mm_sub_epi16(q5i, _mm_set1_epi16(1));
-    //         __m128 q5 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(q5i));
-    //         q5 = _mm_mul_ps(q5, vscale);
-
-    //         _mm_store_ps(y, q5);
-    //         y += 4;
-    //     }
-
-    //     for (size_t j = 0; j < sizeof(x->qs); ++j) {
-    //         __m128 q = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].qs[j]]));
-    //         q = _mm_mul_ps(q, vscale);
-    //         _mm_store_ps(y, q);
-    //         y += 4;
-    //     }
-    // }
-    // #else
    for (int64_t i = 0; i < nb; ++i) {
        for (size_t j = 0; j < sizeof(x->q); ++j) {
            const int8_t * q = (const int8_t *) (q1_3_grid + x[i].q[j]);
@@ -3486,7 +3444,6 @@ void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int6
            }
        }
    }
-    // #endif
}

// ====================== "True" 2-bit (de)-quantization
@@ -11356,14 +11313,15 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
    __m256 accumf = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {
-        // const __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
-        // const __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
+        // const __m128i x12a = _mm_maskload_epi32((const int32_t *) x, _mm_set_epi32(0, -1, -1, -1));
+        // const __m128i x12b = _mm_insert_epi8(x12a, x->qs[0], 12);
        // WARNING: reading 3 bytes further than necessary.
        // It's measurably faster than a masked load on an Intel Core m3-8100Y
-        const __m128i x12b = _mm_loadu_si128((const __m128i_u *) (x[i].q));
+        const __m128i x12b = _mm_loadu_si128((const __m128i_u *) x);
        const __m256i x12 = MM256_SET_M128I(x12b, x12b);

        {
+            // pre-shift the values by 8 bits, and prepare the layout for later packing
            __m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
                                                                   4, -1, 4, -1, 4, -1, 4, -1,
                                                                   1, -1, 1, -1, 1, -1, 1, -1,
@@ -11384,8 +11342,8 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
                                                     3, 9, 27, 81,
                                                     3, 9, 27, 81,
                                                     3, 9, 27, 81);
-            const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1,
-                                                     1, 1, 1, 1,
+            const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1,
+                                                     1, 1, 1, 1,
                                                     3, 9, 27, 81,
                                                     3, 9, 27, 81);
            const __m256i shift1h = _mm256_set_epi16(3, 9, 27, 81,
@@ -11409,18 +11367,21 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
            x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1));
            x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1));

-            const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 0].qs));
-            const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 1].qs));
+            const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[0].qs));
+            const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[1].qs));

-            const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 0].d));
-            const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
+            const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d));
+            const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d));

-            const __m256 q0 = mul_sum_i8_pairs_float(x0, y0);
-            const __m256 q1 = mul_sum_i8_pairs_float(x1, y1);
+            const __m256 q0 = mul_sum_i8_pairs_float(y0, x0);
+            const __m256 q1 = mul_sum_i8_pairs_float(y1, x1);

            accumf = _mm256_fmadd_ps(d0, q0, accumf);
            accumf = _mm256_fmadd_ps(d1, q1, accumf);
        }
+
+        x += 1;
+        y += 2;
    }

    *s = hsum_float_8(accumf);
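
Note on the packing both functions decode: each byte of block_q1_3.q holds five ternary values (stored as {0, 1, 2}, shifted to {-1, 0, 1} at the end) in a fixed-point base-3 form, so one digit can be pulled out by multiplying the byte by a power of 3 and keeping the high byte. That is the trick behind the removed q5i path (*3, >>8, -1), the q1_3_grid table (which maps a byte to four of its digits as packed int8 values), and the {3, 9, 27, 81} shift constants in the AVX2 path. The scalar sketch below illustrates the idea, assuming a ceiling-rounded * 256/243 encoding; the actual rounding and digit order used by the quantization side are not visible in this diff, so treat it as illustrative rather than as the reference implementation.

#include <assert.h>
#include <stdint.h>

// Pack five ternary digits d[0..4] (each 0, 1 or 2) into a single byte.
// The digits form a base-3 integer x in [0, 242]; storing ceil(x * 256 / 243)
// makes ((q * 3^(k+1)) >> 8) equal to the partial sum d[0]*3^k + ... + d[k],
// so "% 3" isolates digit k.  (Assumed rounding; see the note above.)
static uint8_t pack5_trits(const uint8_t d[5]) {
    uint32_t x = 0;
    for (int k = 0; k < 5; ++k) {
        x = x * 3 + d[k];
    }
    return (uint8_t) ((x * 256 + 242) / 243);   // ceil(x * 256 / 243), fits in a byte
}

// Extract digit k (0 = first packed digit) and map {0, 1, 2} to {-1, 0, 1},
// mirroring the "* 3, >> 8, - 1" step in the removed SSE2 comment block.
static int8_t unpack_trit(uint8_t q, int k) {
    static const uint16_t pow3[5] = { 3, 9, 27, 81, 243 };
    return (int8_t) (((q * pow3[k]) >> 8) % 3 - 1);
}

int main(void) {
    const uint8_t digits[5] = { 2, 0, 1, 2, 1 };
    const uint8_t q = pack5_trits(digits);
    for (int k = 0; k < 5; ++k) {
        assert(unpack_trit(q, k) == (int8_t) (digits[k] - 1));
    }
    return 0;
}

In the vectorized dot product, the shuffle appears to place each packed byte in the high half of a 16-bit lane (the "pre-shift the values by 8 bits" comment), presumably so that a high-half 16-bit multiply by the {3, 9, 27, 81} constants yields the same (q * 3^k) >> 8 values for several digits at once.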