
Commit 0efec57

llama : valign + remove unused ftype (#8502)
1 parent 7acfd4e commit 0efec57

3 files changed: +72, -76 lines

examples/quantize/quantize.cpp: 37 additions & 37 deletions
@@ -16,44 +16,44 @@ struct quant_option {
 };
 
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
-    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
-    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
-    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
-    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
-    { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
-    { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
-    { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
-    { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
-    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
-    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
-    { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
-    { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
-    { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
-    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
-    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
-    { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
-    { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
-    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
-    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
-    { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
-    { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
+    { "Q4_0",     LLAMA_FTYPE_MOSTLY_Q4_0,     " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_1",     LLAMA_FTYPE_MOSTLY_Q4_1,     " 4.78G, +0.4511 ppl @ Llama-3-8B", },
+    { "Q5_0",     LLAMA_FTYPE_MOSTLY_Q5_0,     " 5.21G, +0.1316 ppl @ Llama-3-8B", },
+    { "Q5_1",     LLAMA_FTYPE_MOSTLY_Q5_1,     " 5.65G, +0.1062 ppl @ Llama-3-8B", },
+    { "IQ2_XXS",  LLAMA_FTYPE_MOSTLY_IQ2_XXS,  " 2.06 bpw quantization", },
+    { "IQ2_XS",   LLAMA_FTYPE_MOSTLY_IQ2_XS,   " 2.31 bpw quantization", },
+    { "IQ2_S",    LLAMA_FTYPE_MOSTLY_IQ2_S,    " 2.5 bpw quantization", },
+    { "IQ2_M",    LLAMA_FTYPE_MOSTLY_IQ2_M,    " 2.7 bpw quantization", },
+    { "IQ1_S",    LLAMA_FTYPE_MOSTLY_IQ1_S,    " 1.56 bpw quantization", },
+    { "IQ1_M",    LLAMA_FTYPE_MOSTLY_IQ1_M,    " 1.75 bpw quantization", },
+    { "Q2_K",     LLAMA_FTYPE_MOSTLY_Q2_K,     " 2.96G, +3.5199 ppl @ Llama-3-8B", },
+    { "Q2_K_S",   LLAMA_FTYPE_MOSTLY_Q2_K_S,   " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+    { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization", },
+    { "IQ3_S",    LLAMA_FTYPE_MOSTLY_IQ3_S,    " 3.44 bpw quantization", },
+    { "IQ3_M",    LLAMA_FTYPE_MOSTLY_IQ3_M,    " 3.66 bpw quantization mix", },
+    { "Q3_K",     LLAMA_FTYPE_MOSTLY_Q3_K_M,   "alias for Q3_K_M" },
+    { "IQ3_XS",   LLAMA_FTYPE_MOSTLY_IQ3_XS,   " 3.3 bpw quantization", },
+    { "Q3_K_S",   LLAMA_FTYPE_MOSTLY_Q3_K_S,   " 3.41G, +1.6321 ppl @ Llama-3-8B", },
+    { "Q3_K_M",   LLAMA_FTYPE_MOSTLY_Q3_K_M,   " 3.74G, +0.6569 ppl @ Llama-3-8B", },
+    { "Q3_K_L",   LLAMA_FTYPE_MOSTLY_Q3_K_L,   " 4.03G, +0.5562 ppl @ Llama-3-8B", },
+    { "IQ4_NL",   LLAMA_FTYPE_MOSTLY_IQ4_NL,   " 4.50 bpw non-linear quantization", },
+    { "IQ4_XS",   LLAMA_FTYPE_MOSTLY_IQ4_XS,   " 4.25 bpw non-linear quantization", },
+    { "Q4_K",     LLAMA_FTYPE_MOSTLY_Q4_K_M,   "alias for Q4_K_M", },
+    { "Q4_K_S",   LLAMA_FTYPE_MOSTLY_Q4_K_S,   " 4.37G, +0.2689 ppl @ Llama-3-8B", },
+    { "Q4_K_M",   LLAMA_FTYPE_MOSTLY_Q4_K_M,   " 4.58G, +0.1754 ppl @ Llama-3-8B", },
+    { "Q5_K",     LLAMA_FTYPE_MOSTLY_Q5_K_M,   "alias for Q5_K_M", },
+    { "Q5_K_S",   LLAMA_FTYPE_MOSTLY_Q5_K_S,   " 5.21G, +0.1049 ppl @ Llama-3-8B", },
+    { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B", },
+    { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B", },
+    { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B", },
+    { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B", },
+    { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
-    { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
+    { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing", },
 };
 
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
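The quantize.cpp change is purely cosmetic: after the re-alignment, the table still maps a user-facing name to an llama_ftype plus a help-text description. As a hedged illustration of how such a table is typically consumed (a standalone sketch, not code from this commit; the tool's real parsing lives elsewhere in quantize.cpp, and quant_option is assumed to carry name, ftype, and desc fields as in the file above):

// Hypothetical standalone lookup over a QUANT_OPTIONS-style table.
#include <algorithm>
#include <cctype>
#include <string>
#include <vector>

struct quant_option {
    std::string name;
    int         ftype; // stands in for llama_ftype in this sketch
    std::string desc;
};

static bool parse_quant_name(const std::vector<quant_option> & opts,
                             const std::string & in, int & ftype_out) {
    std::string upper = in;
    std::transform(upper.begin(), upper.end(), upper.begin(),
                   [](unsigned char c) { return std::toupper(c); });
    for (const auto & opt : opts) {
        if (opt.name == upper) { // names in the table are uppercase
            ftype_out = opt.ftype;
            return true;
        }
    }
    return false; // unknown name: the caller would print the table and exit
}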

include/llama.h: 1 addition & 1 deletion

@@ -133,7 +133,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
         // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
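Note the convention here: the retired enumerator is commented out rather than deleted, matching the Q4_2 and Q4_3 slots below it. The file type is stored in model files as a plain integer, so value 4 stays reserved; recycling it for a new format would make files written by older versions decode as the wrong type. A minimal sketch of the reader-side consequence, using hypothetical names (not llama.cpp code):

#include <cstdint>

// Hypothetical mirror of the convention above: retired values stay reserved.
enum my_ftype : uint32_t {
    MY_FTYPE_ALL_F32     = 0,
    MY_FTYPE_MOSTLY_F16  = 1,
    // value 4 retired: reusing it would make old files written with that
    // value silently decode as the wrong format
    MY_FTYPE_MOSTLY_Q8_0 = 7,
};

static bool ftype_is_supported(uint32_t v) {
    switch (v) {
        case MY_FTYPE_ALL_F32:
        case MY_FTYPE_MOSTLY_F16:
        case MY_FTYPE_MOSTLY_Q8_0:
            return true;
        default:
            return false; // retired (4, 5, 6) or unknown values
    }
}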

src/llama.cpp: 34 additions & 38 deletions

@@ -4510,40 +4510,36 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     }
 
     switch (ftype) {
-        case LLAMA_FTYPE_ALL_F32: return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
-        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
-        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-            return "Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
-
-        // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+        case LLAMA_FTYPE_ALL_F32:         return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:      return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16:     return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0:     return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1:     return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0:     return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1:     return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0:     return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:     return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:   return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:   return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:   return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:   return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:   return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:   return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:   return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:   return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:     return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:  return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:   return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:    return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:    return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:   return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:  return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:    return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:    return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:   return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
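Removing the enumerator from llama.h forces this deletion as well: a case label must name a declared enumerator, so the LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 case (and the now-stale // K-quants comment) could not stay behind. A toy illustration of that coupling, with a hypothetical enum:

// Toy enum: retiring GREEN makes any leftover 'case COLOR_GREEN:' a compile
// error, which is how the header and the switch stay in sync.
enum color { COLOR_RED = 0, /* COLOR_GREEN = 1, retired */ COLOR_BLUE = 2 };

static const char * color_name(color c) {
    switch (c) {
        case COLOR_RED:  return "red";
        // case COLOR_GREEN: return "green"; // error: not declared
        case COLOR_BLUE: return "blue";
    }
    return "unknown"; // raw values read from data may fall outside the enum
}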
@@ -18069,10 +18065,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
     //}
     bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
+    if (new_type == GGML_TYPE_Q2_K    || new_type == GGML_TYPE_Q3_K    || new_type == GGML_TYPE_Q4_K   ||
+        new_type == GGML_TYPE_Q5_K    || new_type == GGML_TYPE_Q6_K    || new_type == GGML_TYPE_IQ4_XS ||
+        new_type == GGML_TYPE_IQ2_XS  || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S  ||
+        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S   || new_type == GGML_TYPE_IQ3_S  ||
         new_type == GGML_TYPE_IQ1_M) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
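For context on what this re-aligned (behaviorally unchanged) condition feeds: the listed types are k- and i-quants, which pack weights in super-blocks of QK_K values, so a tensor whose row length ne[0] is not a multiple of QK_K gets flagged convert_incompatible_tensor and later falls back to a compatible type. A minimal sketch of that divisibility rule, under the assumption that QK_K is 256 as in ggml's k-quant implementation:

#include <cstdint>

constexpr int64_t QK_K = 256; // assumed k-quant super-block size, as in ggml

// A row can hold k-/i-quant blocks only if it splits evenly into super-blocks.
static bool row_fits_k_quants(int64_t ne0) {
    return ne0 % QK_K == 0;
}
// e.g. row_fits_k_quants(4096) == true, row_fits_k_quants(4100) == false;
// the latter shape would be marked incompatible by the code above.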
