@@ -4696,6 +4696,116 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     }
 }
 
+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
+    }
+
+    return new_type;
+}
+#endif
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
     llama_ftype ftype = params->ftype;
@@ -4781,12 +4891,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
     std::vector<uint8_t> read_data;
@@ -4837,101 +4941,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            // TODO: avoid hardcoded tensor names - use the TN_* constants
-            const auto tn = LLM_TN(ml->get_arch());
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
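A note on the heuristic being moved: use_more_bits() flags the first eighth of the layers, the last eighth, and every third layer in between for higher-bit quantization (e.g. Q6_K instead of Q4_K/Q5_K under the *_K_M ftypes). The standalone sketch below is illustrative only and not part of the patch; the 32-layer depth is an assumed value chosen for the example.

// Minimal sketch: replays the use_more_bits() heuristic from get_k_quant_type()
// to show which layer indices of a hypothetical 32-layer model get extra bits.
#include <cstdio>

static bool use_more_bits(int i_layer, int num_layers) {
    // first 1/8 of layers, last 1/8 of layers, and every third layer in between
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layers = 32; // assumed model depth, for illustration only
    for (int i = 0; i < n_layers; ++i) {
        if (use_more_bits(i, n_layers)) {
            printf("layer %2d: higher-bit quantization\n", i);
        }
    }
    return 0;
}

For 32 layers this selects layers 0-3 and 28-31 plus every third layer from 6 through 27, i.e. 16 of the 32 layers.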