Skip to content

Commit d4c3452

Browse files
committed
ggml-quants : cleanup Q1_3 code formatting
1 parent c6d9b11 commit d4c3452

File tree

1 file changed

+15
-54
lines changed

1 file changed

+15
-54
lines changed

ggml-quants.c

Lines changed: 15 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -3428,48 +3428,6 @@ void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int6
34283428
const int64_t nb = k / QK1_3;
34293429
static_assert(sizeof(x->q) % 4 == 0, "bad block_q1_3.q size");
34303430

3431-
// #if defined(__SSE2__)
3432-
// __m128 vscale = _mm_set1_ps(scale);
3433-
3434-
// for (int64_t i = 0; i < nb; ++i) {
3435-
// for (size_t j = 0; j < sizeof(x->q); j += 4) {
3436-
// __m128 q1 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 0]]));
3437-
// __m128 q2 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 1]]));
3438-
// __m128 q3 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 2]]));
3439-
// __m128 q4 = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].q[j + 3]]));
3440-
// q1 = _mm_mul_ps(q1, vscale);
3441-
// q2 = _mm_mul_ps(q2, vscale);
3442-
// q3 = _mm_mul_ps(q3, vscale);
3443-
// q4 = _mm_mul_ps(q4, vscale);
3444-
3445-
// _mm_store_ps(y + 0, q1);
3446-
// _mm_store_ps(y + 4, q2);
3447-
// _mm_store_ps(y + 8, q3);
3448-
// _mm_store_ps(y + 12, q4);
3449-
// y += 16;
3450-
// }
3451-
3452-
// for (size_t j = 0; j < sizeof(x->q); j += 4) {
3453-
// __m128i q5i = _mm_loadu_si32(x[i].q + j);
3454-
// q5i = _mm_cvtepi8_epi16(q5i);
3455-
// q5i = _mm_add_epi16(q5i, _mm_add_epi16(q5i, q5i));
3456-
// q5i = _mm_srli_epi16(q5i, 8);
3457-
// q5i = _mm_sub_epi16(q5i, _mm_set1_epi16(1));
3458-
// __m128 q5 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(q5i));
3459-
// q5 = _mm_mul_ps(q5, vscale);
3460-
3461-
// _mm_store_ps(y, q5);
3462-
// y += 4;
3463-
// }
3464-
3465-
// for (size_t j = 0; j < sizeof(x->qs); ++j) {
3466-
// __m128 q = _mm_cvtpi8_ps(_m_from_int(q1_3_grid[x[i].qs[j]]));
3467-
// q = _mm_mul_ps(q, vscale);
3468-
// _mm_store_ps(y, q);
3469-
// y += 4;
3470-
// }
3471-
// }
3472-
// #else
34733431
for (int64_t i = 0; i < nb; ++i) {
34743432
for (size_t j = 0; j < sizeof(x->q); ++j) {
34753433
const int8_t * q = (const int8_t *) (q1_3_grid + x[i].q[j]);
@@ -3490,7 +3448,6 @@ void dequantize_row_q1_3(const block_q1_3 * restrict x, float * restrict y, int6
34903448
}
34913449
}
34923450
}
3493-
// #endif
34943451
}
34953452

34963453
// ====================== "True" 2-bit (de)-quantization
@@ -10945,14 +10902,15 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
1094510902
__m256 accumf = _mm256_setzero_ps();
1094610903

1094710904
for (int i = 0; i < nb; ++i) {
10948-
// const __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
10949-
// const __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
10905+
// const __m128i x12a = _mm_maskload_epi32((const int32_t *) x, _mm_set_epi32(0, -1, -1, -1));
10906+
// const __m128i x12b = _mm_insert_epi8(x12a, x->qs[0], 12);
1095010907
// WARNING: reading 3 bytes further than necessary.
1095110908
// It's measurably faster than a masked load on an Intel Core m3-8100Y
10952-
const __m128i x12b = _mm_loadu_si128((const __m128i_u *) (x[i].q));
10909+
const __m128i x12b = _mm_loadu_si128((const __m128i_u *) x);
1095310910
const __m256i x12 = MM256_SET_M128I(x12b, x12b);
1095410911

1095510912
{
10913+
// pre-shift the values by 8 bits, and prepare the layout for later packing
1095610914
__m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
1095710915
4, -1, 4, -1, 4, -1, 4, -1,
1095810916
1, -1, 1, -1, 1, -1, 1, -1,
@@ -10973,8 +10931,8 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
1097310931
3, 9, 27, 81,
1097410932
3, 9, 27, 81,
1097510933
3, 9, 27, 81);
10976-
const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1,
10977-
1, 1, 1, 1,
10934+
const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1,
10935+
1, 1, 1, 1,
1097810936
3, 9, 27, 81,
1097910937
3, 9, 27, 81);
1098010938
const __m256i shift1h = _mm256_set_epi16(3, 9, 27, 81,
@@ -10998,18 +10956,21 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
1099810956
x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1));
1099910957
x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1));
1100010958

11001-
const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 0].qs));
11002-
const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 1].qs));
10959+
const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[0].qs));
10960+
const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[1].qs));
1100310961

11004-
const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 0].d));
11005-
const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
10962+
const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d));
10963+
const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d));
1100610964

11007-
const __m256 q0 = mul_sum_i8_pairs_float(x0, y0);
11008-
const __m256 q1 = mul_sum_i8_pairs_float(x1, y1);
10965+
const __m256 q0 = mul_sum_i8_pairs_float(y0, x0);
10966+
const __m256 q1 = mul_sum_i8_pairs_float(y1, x1);
1100910967

1101010968
accumf = _mm256_fmadd_ps(d0, q0, accumf);
1101110969
accumf = _mm256_fmadd_ps(d1, q1, accumf);
1101210970
}
10971+
10972+
x += 1;
10973+
y += 2;
1101310974
}
1101410975

1101510976
*s = hsum_float_8(accumf);

0 commit comments

Comments
 (0)