|
7 | 7 | #include <algorithm>
|
8 | 8 | #include <cmath>
|
9 | 9 | #include <cstring>
|
| 10 | +#include <cinttypes> |
10 | 11 | #include <fstream>
|
11 | 12 | #include <mutex>
|
12 | 13 | #include <thread>
|
13 | 14 | #include <unordered_map>
|
14 | 15 |
|
15 |
| -// TODO: replace with ggml API call |
16 |
| -#define QK_K 256 |
17 |
| - |
18 | 16 | static void zeros(std::ofstream & file, size_t n) {
|
19 | 17 | char zero = 0;
|
20 | 18 | for (size_t i = 0; i < n; ++i) {
|
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
154 | 152 | if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
155 | 153 | new_type = qs.params->output_tensor_type;
|
156 | 154 | } else {
|
157 |
| - int nx = tensor->ne[0]; |
158 |
| - if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { |
| 155 | + const int64_t nx = tensor->ne[0]; |
| 156 | + const int64_t qk_k = ggml_blck_size(new_type); |
| 157 | + |
| 158 | + if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) { |
159 | 159 | new_type = GGML_TYPE_Q8_0;
|
160 | 160 | }
|
161 | 161 | else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
367 | 367 | // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
368 | 368 | //}
|
369 | 369 | bool convert_incompatible_tensor = false;
|
370 |
| - if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || |
371 |
| - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || |
372 |
| - new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || |
373 |
| - new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || |
374 |
| - new_type == GGML_TYPE_IQ1_M) { |
375 |
| - int nx = tensor->ne[0]; |
376 |
| - int ny = tensor->ne[1]; |
377 |
| - if (nx % QK_K != 0) { |
378 |
| - LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type)); |
| 370 | + { |
| 371 | + const int64_t nx = tensor->ne[0]; |
| 372 | + const int64_t ny = tensor->ne[1]; |
| 373 | + const int64_t qk_k = ggml_blck_size(new_type); |
| 374 | + |
| 375 | + if (nx % qk_k != 0) { |
| 376 | + LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); |
379 | 377 | convert_incompatible_tensor = true;
|
380 | 378 | } else {
|
381 | 379 | ++qs.n_k_quantized;
|
382 | 380 | }
|
383 | 381 | }
|
| 382 | + |
384 | 383 | if (convert_incompatible_tensor) {
|
385 | 384 | switch (new_type) {
|
386 | 385 | case GGML_TYPE_TQ1_0:
|
|
0 commit comments