@@ -3297,6 +3297,10 @@ struct llama_model_loader {
         return nullptr;
     }
 
+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
     const llama_tensor_weight & require_weight(const char * name) const {
         const llama_tensor_weight * weight = get_weight(name);
         if (!weight) {
@@ -14528,26 +14532,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<uint8_t>> work;
     std::vector<no_init<float>> f32_conv_buf;
 
+    uint16_t n_split = 1;
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (int i = 0; i < ml.n_tensors; ++i) {
+            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        }
+    }
+    std::vector<gguf_context*> ctx_outs(n_split, NULL);
+    ctx_outs[0] = ctx_out;
+
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
-        gguf_add_tensor(ctx_out, meta);
+        auto weight = ml.get_weight(i);
+        uint16_t i_split = params->keep_split ? weight->idx : 0;
+        struct ggml_tensor * tensor = weight->tensor;
+        if (ctx_outs[i_split] == NULL) {
+            ctx_outs[i_split] = gguf_init_empty();
+        }
+        gguf_add_tensor(ctx_outs[i_split], tensor);
     }
 
-    std::ofstream fout(fname_out, std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }
 
-    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index = 0) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }
 
-    // placeholder for the meta data
-    ::zeros(fout, meta_size);
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
 
     const auto tn = LLM_TN(model.arch);
-
+    new_ofstream();
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight->idx);
+        }
 
         const std::string name = ggml_get_name(tensor);
 
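Side note on the shard names: inside new_ofstream() the per-split output path is built with llama_split_path(). The following standalone sketch is an assumption for illustration only (the prefix, shard index, and count are made up); it relies on the public llama_split_path() declaration in llama.h and on the usual "<prefix>-%05d-of-%05d.gguf" naming used for split GGUF files:

#include <cstdio>
#include "llama.h"

int main() {
    // fixed-size buffer for the sketch; the diff above uses a PATH_MAX buffer instead
    char split_path[1024] = {0};

    // mirrors the call in new_ofstream(): prefix = fname_out, split_no = cur_split, split_count = n_split
    llama_split_path(split_path, sizeof(split_path), "ggml-model-q4_k_m", /*split_no=*/1, /*split_count=*/4);

    // if the naming convention above holds, this prints: ggml-model-q4_k_m-00002-of-00004.gguf
    printf("%s\n", split_path);
    return 0;
}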
@@ -14702,26 +14754,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_new += new_size;
 
         // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
-
-    // go back to beginning of file and write the updated meta data
-    {
-        fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *) data.data(), data.size());
+    close_ofstream();
+    for (auto & c:ctx_outs) {
+        gguf_free(c);
     }
 
-    fout.close();
-
-    gguf_free(ctx_out);
-
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
@@ -15077,6 +15121,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.quantize_output_tensor =*/ true,
         /*.only_copy              =*/ false,
        /*.pure                    =*/ false,
+        /*.keep_split             =*/ false,
        /*.imatrix                 =*/ nullptr,
        /*.kv_overrides            =*/ nullptr,
     };
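For completeness, here is a caller-side sketch of how the new keep_split flag could be exercised through the existing public API. It is not part of the patch; the file names and the Q4_K_M target are placeholders, and the only new field it relies on is keep_split as added above:

#include "llama.h"

int main() {
    llama_backend_init();

    // start from the library defaults, which now include keep_split = false
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M; // illustrative target type
    qparams.keep_split = true;                      // emit one output shard per input shard

    // input: first shard of a split F16 model (placeholder name);
    // output: used as the llama_split_path() prefix by new_ofstream() when keep_split is set
    const uint32_t rc = llama_model_quantize("ggml-model-f16-00001-of-00004.gguf",
                                             "ggml-model-q4_k_m", &qparams);

    llama_backend_free();
    return rc == 0 ? 0 : 1;
}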