Skip to content

Commit d6fd678

Browse files
committed
wip : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.

* gguf-py : fix formatting
* llama : remove spaces on empty line
1 parent ea049b0 commit d6fd678

File tree

3 files changed

+59
-32
lines changed

3 files changed

+59
-32
lines changed

ggml-quants.c

Lines changed: 56 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11009,40 +11009,68 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
1100911009
__m256 accumf = _mm256_setzero_ps();
1101011010

1101111011
for (int i = 0; i < nb; ++i) {
11012-
{
11013-
__m256i x0 = _mm256_set_epi32(q1_3_grid[x[i].q[7]], q1_3_grid[x[i].q[6]],
11014-
q1_3_grid[x[i].q[5]], q1_3_grid[x[i].q[4]],
11015-
q1_3_grid[x[i].q[3]], q1_3_grid[x[i].q[2]],
11016-
q1_3_grid[x[i].q[1]], q1_3_grid[x[i].q[0]]);
11017-
__m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i].qs));
11018-
11019-
__m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d));
11020-
11021-
__m256 q = mul_sum_i8_pairs_float(x0, y0);
11022-
11023-
accumf = _mm256_fmadd_ps(d, q, accumf);
11024-
}
11012+
// __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
11013+
// __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
11014+
// WARNING: reading 3 bytes further than necessary. It's faster than the above on my CPU, though.
11015+
__m128i x12b = _mm_loadu_si128((const __m128i_u *) x[i].q);
11016+
__m256i x12 = MM256_SET_M128I(x12b, x12b);
1102511017

1102611018
{
11027-
__m256i x1 = _mm256_castsi128_si256(_mm_set_epi32(q1_3_grid[x[i].q[11]], q1_3_grid[x[i].q[10]],
11028-
q1_3_grid[x[i].q[9]], q1_3_grid[x[i].q[8]]));
11029-
__m256i x2 = _mm256_cvtepu8_epi16(_mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1)));
11019+
__m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
11020+
4, -1, 4, -1, 4, -1, 4, -1,
11021+
1, -1, 1, -1, 1, -1, 1, -1,
11022+
0, -1, 0, -1, 0, -1, 0, -1));
11023+
__m256i x0h = _mm256_shuffle_epi8(x12, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1,
11024+
6, -1, 6, -1, 6, -1, 6, -1,
11025+
3, -1, 3, -1, 3, -1, 3, -1,
11026+
2, -1, 2, -1, 2, -1, 2, -1));
11027+
__m256i x1l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(7, -1, 6, -1, 5, -1, 4, -1,
11028+
3, -1, 2, -1, 1, -1, 0, -1,
11029+
9, -1, 9, -1, 9, -1, 9, -1,
11030+
8, -1, 8, -1, 8, -1, 8, -1));
11031+
__m256i x1h = _mm256_shuffle_epi8(x12, _mm256_set_epi8(12, -1, 12, -1, 12, -1, 12, -1,
11032+
11, -1, 10, -1, 9, -1, 8, -1,
11033+
11, -1, 11, -1, 11, -1, 11, -1,
11034+
10, -1, 10, -1, 10, -1, 10, -1));
11035+
const __m256i shift0 = _mm256_set_epi16(3, 9, 27, 81,
11036+
3, 9, 27, 81,
11037+
3, 9, 27, 81,
11038+
3, 9, 27, 81);
11039+
const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1,
11040+
1, 1, 1, 1,
11041+
3, 9, 27, 81,
11042+
3, 9, 27, 81);
11043+
const __m256i shift1h = _mm256_set_epi16(3, 9, 27, 81,
11044+
1, 1, 1, 1,
11045+
3, 9, 27, 81,
11046+
3, 9, 27, 81);
11047+
x0l = _mm256_mullo_epi16(x0l, shift0);
11048+
x0h = _mm256_mullo_epi16(x0h, shift0);
11049+
x1l = _mm256_mullo_epi16(x1l, shift1l);
11050+
x1h = _mm256_mullo_epi16(x1h, shift1h);
11051+
x0l = _mm256_mulhi_epu16(x0l, _mm256_set1_epi16(3));
11052+
x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3));
11053+
x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3));
11054+
x1h = _mm256_mulhi_epu16(x1h, _mm256_set1_epi16(3));
11055+
x0l = _mm256_sub_epi16(x0l, _mm256_set1_epi16(1));
11056+
x0h = _mm256_sub_epi16(x0h, _mm256_set1_epi16(1));
11057+
x1l = _mm256_sub_epi16(x1l, _mm256_set1_epi16(1));
11058+
x1h = _mm256_sub_epi16(x1h, _mm256_set1_epi16(1));
11059+
11060+
__m256i x0 = _mm256_packs_epi16(x0l, x0h);
11061+
__m256i x1 = _mm256_packs_epi16(x1l, x1h);
11062+
11063+
__m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 0].qs));
1103011064
__m256i y1 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 1].qs));
1103111065

11032-
x2 = _mm256_mulhi_epu16(x2, _mm256_set1_epi16(3 << 8));
11033-
x2 = _mm256_sub_epi16(x2, _mm256_set1_epi16(1));
11034-
11035-
// TODO: reduce shuffling
11036-
x2 = _mm256_packs_epi16(x2, _mm256_setzero_si256());
11037-
x2 = _mm256_permute4x64_epi64(x2, _MM_SHUFFLE(3, 1, 2, 0));
11038-
__m128i x2_l = _mm_insert_epi32(_mm256_castsi256_si128(x2), q1_3_grid[x[i].qs[0]], 3);
11039-
x1 = _mm256_inserti128_si256(x1, x2_l, 1);
11040-
11041-
__m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
11066+
__m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d));
11067+
__m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
1104211068

11043-
__m256 q = mul_sum_i8_pairs_float(x1, y1);
11069+
__m256 q0 = mul_sum_i8_pairs_float(x0, y0);
11070+
__m256 q1 = mul_sum_i8_pairs_float(x1, y1);
1104411071

11045-
accumf = _mm256_fmadd_ps(d, q, accumf);
11072+
accumf = _mm256_fmadd_ps(d0, q0, accumf);
11073+
accumf = _mm256_fmadd_ps(d1, q1, accumf);
1104611074
}
1104711075
}
1104811076

gguf-py/gguf/quants.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,9 @@ def __quantize_q1_3_rows(n: np.ndarray) -> np.ndarray:
148148
q48 = np.sum(q48 * pow3.reshape((1, 1, 4)), axis=2, keepdims=True).reshape((n_blocks, 12))
149149
q4 = np.sum(q4 * pow3.reshape((1, 4)), axis=1, keepdims=True)
150150
q48 = q48 + (q12 * 81)
151-
q = np.concatenate([q48, q4], axis=1);
151+
q = np.concatenate([q48, q4], axis=1)
152152
q = ((q.astype(np.uint16) * 256) // 243).astype(np.uint8)
153-
q = np.where(q != 0, q + 1, 0);
153+
q = np.where(q != 0, q + 1, 0)
154154

155155
return q.reshape(__quantize_q1_3_shape_change(shape))
156156

@@ -170,4 +170,3 @@ def quantize_q1_3(data: np.ndarray):
170170
return __quantize_q1_3_lazy(data)
171171
else:
172172
return __quantize_q1_3_array(data)
173-

llama.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11699,7 +11699,7 @@ struct llm_build_context {
1169911699

1170011700
struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
1170111701
tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
11702-
11702+
1170311703
cb(tmp, "ffn_up", il);
1170411704

1170511705
cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);

0 commit comments

Comments (0)