@@ -8829,6 +8829,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8829
8829
// Predicate over a layer index: returns true when the layer lies in the first
// eighth of the model (i_layer < num_layers/8), in the last eighth
// (i_layer >= 7*num_layers/8), or is every third layer counted from the end of
// the first eighth (offset % 3 == 2). All divisions are integer divisions.
// The ffn_gate / ffn_up branches further down select GGML_TYPE_Q2_K for
// LLAMA_FTYPE_MOSTLY_Q3_K_XS when this predicate returns false.
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
8830
8830
return i_layer < num_layers/8 || i_layer >= 7 *num_layers/8 || (i_layer - num_layers/8 )%3 == 2 ;
8831
8831
};
8832
// Expert count read from the model hyper-parameters, clamped to at least 1 so
// that a value of 0 takes the single-expert path below.
// NOTE(review): presumably n_expert is 0 for non-MoE models - confirm.
+ const int n_expert = std::max (1 , (int )qs.model .hparams .n_expert );
8833
// layer_info: resolves the (layer index, layer count) pair for a tensor.
//   i_layer - running tensor counter supplied by the caller (e.g. qs.i_ffn_down)
//   n_layer - total tensor count supplied by the caller (e.g. qs.n_ffn_down)
//   name    - tensor name as a C string
// With a single expert the two inputs are returned unchanged. With more than
// one expert, n_layer is divided by n_expert and i_layer is replaced by the
// block number parsed out of the tensor name.
// Throws std::runtime_error if the name cannot be parsed or if the parsed
// index falls outside [0, n_layer).
+ auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
8834
+ if (n_expert > 1 ) {
8835
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
8836
+ // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
8837
+ // for getting the current layer as I initially thought, and we need to resort to parsing the
8838
+ // tensor name.
8839
// The caller's count covers every expert, so scale it down to a per-layer count.
+ n_layer /= n_expert;
8840
// sscanf returns the number of converted items; exactly one (%d) is expected.
+ if (sscanf (name, " blk.%d." , &i_layer) != 1 ) {
8841
+ throw std::runtime_error (format (" Failed to determine layer for tensor %s" , name));
8842
+ }
8843
// Reject parsed indices outside the valid layer range.
+ if (i_layer < 0 || i_layer >= n_layer) {
8844
+ throw std::runtime_error (format (" Bad layer %d for tensor %s. Must be in [0, %d)" , i_layer, name, n_layer));
8845
+ }
8846
+ }
8847
+ return std::make_pair (i_layer, n_layer);
8848
+ };
8832
8849
8833
8850
if (name == tn (LLM_TENSOR_OUTPUT, " weight" )) {
8834
8851
int nx = tensor->ne [0 ];
@@ -8890,24 +8907,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8890
8907
new_type = GGML_TYPE_Q2_K;
8891
8908
}
8892
8909
} else if (name.find (" ffn_down" ) != std::string::npos) {
8893
- const int n_expert = std::max (1 , (int )qs.model .hparams .n_expert );
8894
- int i_layer, n_layer;
8895
- if (n_expert == 1 ) {
8896
- i_layer = qs.i_ffn_down ;
8897
- n_layer = qs.n_ffn_down ;
8898
- } else {
8899
- // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
8900
- // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
8901
- // for getting the current layer as I initially thought, and we need to resort to parsing the
8902
- // tensor name.
8903
- n_layer = qs.n_ffn_down / n_expert;
8904
- if (sscanf (name.c_str (), " blk.%d.ffn_down" , &i_layer) != 1 ) {
8905
- throw std::runtime_error (format (" Failed to determine layer for tensor %s" , name.c_str ()));
8906
- }
8907
- if (i_layer < 0 || i_layer >= n_layer) {
8908
- throw std::runtime_error (format (" Bad layer %d for tensor %s. Must be in [0, %d)" , i_layer, name.c_str (), n_layer));
8909
- }
8910
- }
8910
+ auto info = layer_info (qs.i_ffn_down , qs.n_ffn_down , name.c_str ());
8911
+ int i_layer = info.first , n_layer = info.second ;
8911
8912
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8912
8913
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
8913
8914
if (i_layer < n_layer/8 ) new_type = GGML_TYPE_Q4_K;
@@ -8963,13 +8964,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8963
8964
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
8964
8965
}
8965
8966
else if (name.find (" ffn_gate" ) != std::string::npos) {
8966
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits (qs.i_ffn_gate , qs.n_ffn_gate )) {
8967
+ auto info = layer_info (qs.i_ffn_gate , qs.n_ffn_gate , name.c_str ());
8968
+ int i_layer = info.first , n_layer = info.second ;
8969
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits (i_layer, n_layer)) {
8967
8970
new_type = GGML_TYPE_Q2_K;
8968
8971
}
8969
8972
++qs.i_ffn_gate ;
8970
8973
}
8971
8974
else if (name.find (" ffn_up" ) != std::string::npos) {
8972
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits (qs.i_ffn_up , qs.n_ffn_up )) {
8975
+ auto info = layer_info (qs.i_ffn_up , qs.n_ffn_up , name.c_str ());
8976
+ int i_layer = info.first , n_layer = info.second ;
8977
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits (i_layer, n_layer)) {
8973
8978
new_type = GGML_TYPE_Q2_K;
8974
8979
}
8975
8980
++qs.i_ffn_up ;
0 commit comments