
Commit ee57cbb

llama : replace gguf_file_saver with new gguf write API
1 parent 35177d7 commit ee57cbb

File tree

1 file changed: +15 −162 lines changed


gguf-llama.cpp

Lines changed: 15 additions & 162 deletions
@@ -695,172 +695,14 @@ struct gguf_file_loader {

             tensor.name = name;
             tensor.size = ggml_nbytes(cur);
+            tensor.ggml_tensor = cur;

             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };

-struct gguf_file_saver {
-    // TODO
-    // this implementation now assumes that the data section is of the same length as the unquantized model.
-    // this is needed to write tensor metadata and weights in a single pass by seeking to appropriate positions in the file.
-    // this may not be true when we add quantization version and change ftype description (currently it's string according to the specs,
-    // but better to have it as uint32).
-    // we need to calculate the delta in number of bytes written with a counter as a struct member.
-
-    gguf_context * ctx; // loaded gguf context (used to re-write the KV section (good enough for now))
-
-    gguf_file file;
-    size_t info_offset;
-    size_t tensor_offset;
-
-    gguf_file_saver(const char * fname, gguf_context * ctx) : ctx(ctx), file(fname, "wb") {
-        LLAMA_LOG_INFO("%s: saving model to %s\n", __func__, fname);
-
-        write_header();
-        write_kv();
-    }
-
-    void write_header() {
-        file.write_i32(GGUF_MAGIC);
-        file.write_i32(GGUF_VERSION);
-        file.write_i32(gguf_get_n_tensors(ctx));
-        file.write_i32(gguf_get_n_kv (ctx));
-    }
-
-    void write_kv_arr_i32(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<int32_t> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            int32_t val = gguf_get_arr_i32(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr<int32_t>(key, type, data);
-    }
-
-    void write_kv_arr_f32(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<float> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            float val = gguf_get_arr_f32(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr<float>(key, type, data);
-    }
-
-    void write_kv_arr_str(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<std::string> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            std::string val = gguf_get_arr_str(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr(key, type, data);
-    }
-
-    // re-write the key-value section from the loaded file
-    void write_kv() {
-        const int32_t n_kv = gguf_get_n_kv(ctx);
-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
-            LLAMA_LOG_INFO("%s: writing key '%s'\n", __func__, key);
-
-            if (strcmp(key, "general.quantization_version") == 0) {
-                file.write_val<uint32_t>("general.quantization_version", GGUF_TYPE_UINT32, GGML_QNT_VERSION);
-            } else {
-                const gguf_type vtype = gguf_get_kv_type(ctx, i);
-
-                switch (vtype) {
-                    case GGUF_TYPE_BOOL:    file.write_val<bool>    (key, GGUF_TYPE_BOOL,    gguf_get_val_bool(ctx, i)); break;
-                    case GGUF_TYPE_FLOAT32: file.write_val<float>   (key, GGUF_TYPE_FLOAT32, gguf_get_val_f32 (ctx, i)); break;
-                    case GGUF_TYPE_INT16:   file.write_val<int16_t> (key, GGUF_TYPE_INT16,   gguf_get_val_i16 (ctx, i)); break;
-                    case GGUF_TYPE_INT32:   file.write_val<int32_t> (key, GGUF_TYPE_INT32,   gguf_get_val_i32 (ctx, i)); break;
-                    case GGUF_TYPE_INT8:    file.write_val<int8_t>  (key, GGUF_TYPE_INT8,    gguf_get_val_i8  (ctx, i)); break;
-                    case GGUF_TYPE_STRING:  file.write_str          (key, GGUF_TYPE_STRING,  gguf_get_val_str (ctx, i)); break;
-                    case GGUF_TYPE_UINT16:  file.write_val<uint16_t>(key, GGUF_TYPE_UINT16,  gguf_get_val_u16 (ctx, i)); break;
-                    case GGUF_TYPE_UINT32:  file.write_val<uint32_t>(key, GGUF_TYPE_UINT32,  gguf_get_val_u32 (ctx, i)); break;
-                    case GGUF_TYPE_UINT8:   file.write_val<uint8_t> (key, GGUF_TYPE_UINT8,   gguf_get_val_u8  (ctx, i)); break;
-                    case GGUF_TYPE_ARRAY:
-                        {
-                            const gguf_type arr_type = gguf_get_arr_type(ctx, i);
-                            const int n_arr = gguf_get_arr_n (ctx, i);
-
-                            switch (arr_type) {
-                                case GGUF_TYPE_FLOAT32: write_kv_arr_f32(key, arr_type, i, n_arr); break;
-                                case GGUF_TYPE_INT32:   write_kv_arr_i32(key, arr_type, i, n_arr); break;
-                                case GGUF_TYPE_STRING:  write_kv_arr_str(key, arr_type, i, n_arr); break;
-                                default:
-                                    throw std::runtime_error(format("cannot recognize array type for key %s\n", key));
-                            }
-                        } break;
-                    default:
-                        throw std::runtime_error(format("cannot recognize value type for key %s\n", key));
-                }
-            }
-        }
-
-        info_offset = file.tell();
-
-        GGML_ASSERT(gguf_get_data_offset(ctx) >= info_offset);
-
-        const size_t count = gguf_get_data_offset(ctx) - info_offset;
-
-        file.write_zeros(count);
-        file.seek(info_offset, SEEK_SET);
-    }
-
-    size_t write_tensor_info(gguf_load_tensor & tensor, enum ggml_type type) {
-        size_t total_written = 0;
-        file.seek(info_offset, SEEK_SET);
-        total_written += file.write_str(tensor.name);
-
-        int32_t n_dims = tensor.ne.size();
-        total_written += file.write_i32(n_dims);
-        for (int32_t i = 0; i < n_dims; ++i) {
-            total_written += file.write_i32(tensor.ne[i]);
-        }
-
-        total_written += file.write_i32(type);
-        total_written += file.write_u64(tensor_offset);
-        info_offset += total_written; // position to write info of the next tensor
-
-        file.seek(0, SEEK_END);
-
-        return total_written;
-    }
-
-    void write_tensor(gguf_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
-        switch (new_type) {
-            case GGML_TYPE_F32:
-            case GGML_TYPE_F16:
-            case GGML_TYPE_Q4_0:
-            case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q5_0:
-            case GGML_TYPE_Q5_1:
-            case GGML_TYPE_Q8_0:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_Q4_K:
-            case GGML_TYPE_Q5_K:
-            case GGML_TYPE_Q6_K:
-                break;
-            default: GGML_ASSERT(false);
-        }
-
-        write_tensor_info(tensor, new_type);
-        file.write_raw(new_data, new_size);
-        size_t padded_size = GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT); // TODO: handle custom alignment
-        size_t pad = padded_size - new_size;
-        file.write_zeros(pad);
-        tensor_offset += padded_size; // offset of the next tensor
-    }
-};
-
 struct llama_model_loader {
     std::unique_ptr<gguf_file_loader> file_loader;
     gguf_load_tensors_map tensors_map;
@@ -897,7 +739,6 @@ struct llama_model_loader {
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
         ggml_set_name(tensor, lt.name.c_str());
-        GGML_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor

         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ggml_ctx, use_mmap);
@@ -936,6 +777,8 @@ struct llama_model_loader {
             file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
         }
+
+        lt.ggml_tensor->data = lt.data; // TODO: not great
     }

     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
@@ -3245,7 +3088,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }

     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
-    gguf_file_saver file_saver(fname_out.c_str(), model_loader->file_loader->gguf_ctx);
+
+    struct gguf_context * ctx_out = gguf_init_empty();
+
+    // copy the KV pairs from the input file
+    gguf_set_kv(ctx_out, model_loader->file_loader->gguf_ctx);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);

 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -3437,12 +3285,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
         total_size_org += tensor.size;
         total_size_new += new_size;
-        file_saver.write_tensor(tensor, new_type, new_data, new_size);
+
+        gguf_add_tensor(ctx_out, tensor.ggml_tensor);
     }

+    gguf_write_to_file(ctx_out, fname_out.c_str());
+    gguf_free(ctx_out);
+
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

+    // print histogram for all tensors
     {
         int64_t sum_all = 0;
         for (size_t i = 0; i < hist_all.size(); i++) {
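
Taken together, the quantization path now builds the output file through the gguf write API instead of the removed gguf_file_saver. The following is a minimal sketch of that pattern outside the diff, assuming the helper name write_quantized_model and its parameters (illustrative only, not llama.cpp code), that the gguf declarations are available via ggml.h as at the time of this commit, and the two-argument gguf_write_to_file form used in the diff; later revisions of the API may differ.

// sketch only: mirrors the gguf write calls introduced in this commit; the helper
// name, its parameters, and the header location are assumptions, not llama.cpp code
#include "ggml.h"

static void write_quantized_model(struct gguf_context * ctx_in,
                                  struct ggml_tensor ** tensors, int n_tensors,
                                  const char * fname_out) {
    // describe the output file in memory instead of patching the input layout on disk
    struct gguf_context * ctx_out = gguf_init_empty();

    // copy every KV pair from the loaded input context and stamp the quantization version
    gguf_set_kv(ctx_out, ctx_in);
    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);

    // register each (possibly re-quantized) tensor; name, type, shape and data come from the ggml_tensor
    for (int i = 0; i < n_tensors; ++i) {
        gguf_add_tensor(ctx_out, tensors[i]);
    }

    // header, KV section, tensor infos, alignment padding and tensor data are written in one call
    gguf_write_to_file(ctx_out, fname_out);

    gguf_free(ctx_out);
}

Compared with the removed gguf_file_saver, which seeked back and forth to patch tensor metadata and assumed the data section kept the unquantized layout, this defers all offset and padding bookkeeping to gguf itself.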
