Skip to content

Commit b3298fa

Browse files
authored
metal : refactor mat-vec code (#12569)
* metal : refactor mat-vec code ggml-ci
* metal : rename all_sum -> sum_all ggml-ci
* metal : fix comments [no ci]
* metal : fix nr constant [no ci]
* metal : mv q6_K support nr0 > 1 ggml-ci
* metal : reduce register pressure ggml-ci
* metal : fix typo [no ci]
* metal : reduce register pressure ggml-ci
1 parent 2447ad8 commit b3298fa

File tree

3 files changed

+521
-498
lines changed

3 files changed

+521
-498
lines changed

ggml/src/ggml-metal/ggml-metal-impl.h

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,70 @@
11
#ifndef GGML_METAL_IMPL
22
#define GGML_METAL_IMPL
33

4+
// kernel parameters for mat-vec threadgroups
5+
//
6+
// N_R0: number of src0 rows to process per simdgroup
7+
// N_SG: number of simdgroups per threadgroup
8+
//
9+
// TODO: for optimal performance, these should become functions of the device and work size
10+
11+
#define N_R0_Q4_0 4
12+
#define N_SG_Q4_0 2
13+
14+
#define N_R0_Q4_1 4
15+
#define N_SG_Q4_1 2
16+
17+
#define N_R0_Q5_0 4
18+
#define N_SG_Q5_0 2
19+
20+
#define N_R0_Q5_1 4
21+
#define N_SG_Q5_1 2
22+
23+
#define N_R0_Q8_0 4
24+
#define N_SG_Q8_0 2
25+
26+
#define N_R0_Q2_K 4
27+
#define N_SG_Q2_K 2
28+
29+
#define N_R0_Q3_K 2
30+
#define N_SG_Q3_K 2
31+
32+
#define N_R0_Q4_K 4
33+
#define N_SG_Q4_K 2
34+
35+
#define N_R0_Q5_K 2
36+
#define N_SG_Q5_K 2
37+
38+
#define N_R0_Q6_K 1
39+
#define N_SG_Q6_K 2
40+
41+
#define N_R0_IQ1_S 4
42+
#define N_SG_IQ1_S 2
43+
44+
#define N_R0_IQ1_M 4
45+
#define N_SG_IQ1_M 2
46+
47+
#define N_R0_IQ2_XXS 4
48+
#define N_SG_IQ2_XXS 2
49+
50+
#define N_R0_IQ2_XS 4
51+
#define N_SG_IQ2_XS 2
52+
53+
#define N_R0_IQ2_S 4
54+
#define N_SG_IQ2_S 2
55+
56+
#define N_R0_IQ3_XXS 4
57+
#define N_SG_IQ3_XXS 2
58+
59+
#define N_R0_IQ3_S 4
60+
#define N_SG_IQ3_S 2
61+
62+
#define N_R0_IQ4_NL 2
63+
#define N_SG_IQ4_NL 2
64+
65+
#define N_R0_IQ4_XS 2
66+
#define N_SG_IQ4_XS 2
67+
468
// kernel argument structs
569
//
670
// - element counters (e.g. ne00) typically use int32_t to reduce register usage

0 commit comments

Comments
 (0)