@@ -15599,8 +15599,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_Q5_K;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
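Note: `llama_tensor_get_type` only overrides the default type implied by the requested ftype for specific tensor names, so for `attn_q.weight` the hunk above now reads roughly as the condensed sketch below. `attn_q_override` is a hypothetical helper name used only for illustration (the real code assigns to `new_type` in place), and the sketch assumes the `llama_ftype` / `ggml_type` enums from llama.h and ggml.h.

```cpp
// Condensed sketch of the attn_q.weight override after this change (illustrative only).
static ggml_type attn_q_override(llama_ftype ftype, ggml_type new_type) {
    if      (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;   // formerly the second branch
    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S)   new_type = GGML_TYPE_IQ3_XXS; // replaces the old IQ3_XS case
    // ... the IQ2_XXS / IQ2_XS / IQ1_S / IQ2_S / IQ2_M / IQ1_M / IQ2_XL cases follow as in the hunk above ...
    return new_type;
}
```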
@@ -15715,9 +15715,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q3_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q4_K;
+        if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
@@ -15731,9 +15729,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q3_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q4_K;
+        if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS;
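Both the ffn_gate and ffn_up branches now key the Q2_K_L bump on `use_more_bits()` instead of the plain first-eighth-of-layers check that was dropped. For reference, upstream llama.cpp defines that helper roughly as below (reproduced from memory, so verify against the tree): it selects the first eighth and the last eighth of the layers plus every third layer in between.

```cpp
// Approximate upstream use_more_bits(): spend extra bits on the first n/8 layers,
// the last n/8 layers, and every third layer in the middle band.
static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
}
```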
@@ -16212,7 +16208,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
         }

-        LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
+        LLAMA_LOG_INFO("converts to %s .. ", ggml_type_name(new_type));
         fflush(stdout);

         if (work.size() < (size_t)nelements * 4) {