
Commit 0c64968

llama : refactor k-quant mixture logic into a function
1 parent 71ca2fa commit 0c64968

File tree: 1 file changed

llama.cpp — 113 additions, 101 deletions
@@ -4696,6 +4696,116 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     }
 }
 
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
+    }
+
+    return new_type;
+}
+#endif
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
     llama_ftype ftype = params->ftype;
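
Note on the layer-mixing heuristic in the new function above: use_more_bits(i_layer, num_layers) selects roughly the first eighth of layers, the last eighth, and every third layer in between for the higher-bit quant type. Below is a minimal standalone sketch of that selection pattern; the 32-layer count and the main() harness are illustrative only and not part of this commit.

// Standalone sketch of the use_more_bits() selection pattern from the function above.
// The layer count below is an arbitrary example value.
#include <cstdio>

static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layers = 32; // example only
    printf("layers quantized with the higher-bit type:");
    for (int i = 0; i < n_layers; ++i) {
        if (use_more_bits(i, n_layers)) {
            printf(" %d", i);
        }
    }
    printf("\n");
    return 0;
}
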
@@ -4781,12 +4891,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
     std::vector<uint8_t> read_data;
@@ -4837,101 +4941,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            // TODO: avoid hardcoded tensor names - use the TN_* constants
-            const auto tn = LLM_TN(ml->get_arch());
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
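
Note on the MODEL_70B branch in the refactored logic: with 8 query heads sharing each attn_v head, attn_v.weight is about 8x smaller than attn_q.weight, so giving it roughly one extra bit per weight costs very little overall. A rough back-of-the-envelope sketch follows; the embedding width, layer count, per-weight bit delta, and total model size are assumed example values, not taken from this commit.

// Back-of-the-envelope estimate of the extra size from giving attn_v.weight one
// more bit per weight when it is ~8x smaller than attn_q.weight.
// All constants are illustrative assumptions, not values from this commit.
#include <cstdio>

int main() {
    const double n_embd  = 8192;  // assumed embedding width of a 70B-class model
    const double n_layer = 80;    // assumed layer count
    const double q_elems = n_embd * n_embd;  // attn_q.weight elements per layer
    const double v_elems = q_elems / 8.0;    // attn_v.weight is ~8x smaller (8 heads share it)
    const double extra_bits_per_weight = 1.0; // e.g. a Q4_K-class -> Q5_K-class bump

    const double extra_bytes = n_layer * v_elems * extra_bits_per_weight / 8.0;
    const double model_bytes = 40.0 * 1024 * 1024 * 1024; // assumed ~40 GiB quantized model
    printf("extra size: ~%.0f MiB (~%.2f%% of the assumed model size)\n",
           extra_bytes / (1024.0 * 1024.0), 100.0 * extra_bytes / model_bytes);
    return 0;
}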
