
Commit ae92c72

ngxson authored and NeoZhangJianyu committed
llama : add llama_model_load_from_splits (ggml-org#11255)
* llama : add `llama_model_load_from_splits`
* update
1 parent 0cb12fe · commit ae92c72

5 files changed: +116 −24 lines


include/llama.h

Lines changed: 10 additions & 0 deletions
@@ -418,10 +418,20 @@ extern "C" {
               struct llama_model_params   params),
             "use llama_model_load_from_file instead");
 
+    // Load the model from a file
+    // If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
+    // If the split file name does not follow this pattern, use llama_model_load_from_splits
     LLAMA_API struct llama_model * llama_model_load_from_file(
                              const char * path_model,
               struct llama_model_params   params);
 
+    // Load the model from multiple splits (support custom naming scheme)
+    // The paths must be in the correct order
+    LLAMA_API struct llama_model * llama_model_load_from_splits(
+                             const char ** paths,
+                                    size_t   n_paths,
+              struct llama_model_params     params);
+
     DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
             "use llama_model_free instead");
 
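A minimal usage sketch (not part of the commit) of the new entry point declared above, assuming a model split under a custom naming scheme; the .gguf paths are hypothetical placeholders and must be passed in split order, with the first split first:

    // Sketch: loading a model from an explicit list of splits.
    #include "llama.h"

    int main() {
        const char * paths[] = {
            "my-model-part-a.gguf", // hypothetical names; first split must come first
            "my-model-part-b.gguf",
            "my-model-part-c.gguf",
        };
        struct llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_model_load_from_splits(paths, 3, mparams);
        if (model == nullptr) {
            return 1;
        }
        // ... create a context, run inference, etc. ...
        llama_model_free(model);
        return 0;
    }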
src/llama-model-loader.cpp

Lines changed: 63 additions & 11 deletions
@@ -64,6 +64,33 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     }
 }
 
+// return a list of splits for a given path
+// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
+static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
+    std::vector<std::string> paths;
+    std::string split_prefix;
+    std::vector<char> buf(llama_path_max(), 0);
+
+    {
+        int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
+        if (!ret) {
+            throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
+        }
+        split_prefix = std::string(buf.data(), ret);
+    }
+
+    if (split_prefix.empty()) {
+        throw std::runtime_error(format("invalid split file: %s", path.c_str()));
+    }
+
+    for (int idx = 0; idx < n_split; ++idx) {
+        int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
+        paths.push_back(std::string(buf.data(), ret));
+    }
+
+    return paths;
+}
+
 namespace GGUFMeta {
     template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
@@ -413,7 +440,12 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
 
-llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
+llama_model_loader::llama_model_loader(
+        const std::string & fname,
+        std::vector<std::string> & splits,
+        bool use_mmap,
+        bool check_tensors,
+        const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -425,6 +457,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
         }
     }
 
+    // Load the main GGUF
     struct ggml_context * ctx = NULL;
     struct gguf_init_params params = {
         /*.no_alloc = */ true,
@@ -460,35 +493,54 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
 
     // Load additional GGML contexts
     if (n_split > 1) {
+        // make sure the main file is loaded first
         uint16_t idx = 0;
-        get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+        const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
+        get_key(kv_split_no, idx);
         if (idx != 0) {
-            throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+            throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
+        }
+
+        // generate list of splits if needed
+        if (splits.empty()) {
+            splits = llama_get_list_splits(fname, idx, n_split);
         }
 
-        std::vector<char> split_prefix(llama_path_max(), 0);
-        if (!llama_split_prefix(split_prefix.data(), split_prefix.size(), fname.c_str(), idx, n_split)) {
-            throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+        // in case user give a custom list of splits, check if it matches the expected number
+        if (n_split != (uint16_t)splits.size()) {
+            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
         }
 
         if (trace > 0) {
            LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
         }
 
-        std::vector<char> split_path(llama_path_max(), 0);
+        // load other splits
         for (idx = 1; idx < n_split; idx++) {
-            llama_split_path(split_path.data(), split_path.size(), split_prefix.data(), idx, n_split);
+            const char * fname_split = splits[idx].c_str();
 
             struct gguf_init_params split_params = {
                 /*.no_alloc = */ true,
                 /*.ctx      = */ &ctx,
             };
-            gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path.data(), split_params) };
+            gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
             if (!ctx_gguf) {
-                throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path.data()));
+                throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
+            }
+
+            // check idx
+            {
+                const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
+                if (kid < 0) {
+                    throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+                }
+                int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
+                if (idx_gguf != idx) {
+                    throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+                }
             }
 
-            files.emplace_back(new llama_file(split_path.data(), "rb"));
+            files.emplace_back(new llama_file(fname_split, "rb"));
             contexts.emplace_back(ctx);
 
             // Save tensors data offset info of the shard.
src/llama-model-loader.h

Lines changed: 6 additions & 1 deletion
@@ -90,7 +90,12 @@ struct llama_model_loader {
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;
 
-    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
+    llama_model_loader(
+        const std::string & fname,
+        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
+        bool use_mmap,
+        bool check_tensors,
+        const struct llama_model_kv_override * param_overrides_p);
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type

src/llama-quant.cpp

Lines changed: 2 additions & 1 deletion
@@ -526,7 +526,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         kv_overrides = v->data();
     }
 
-    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
+    std::vector<std::string> splits = {};
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());

src/llama.cpp

Lines changed: 35 additions & 11 deletions
@@ -35,7 +35,7 @@
 #endif
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
@@ -44,7 +44,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
 
         ml.print_info();
 
@@ -9378,14 +9378,9 @@ int64_t llama_time_us(void) {
     return ggml_time_us();
 }
 
-struct llama_model * llama_load_model_from_file(
-        const char * path_model,
-        struct llama_model_params params) {
-    return llama_model_load_from_file(path_model, params);
-}
-
-struct llama_model * llama_model_load_from_file(
-        const char * path_model,
+static struct llama_model * llama_model_load_from_file_impl(
+        const std::string & path_model,
+        std::vector<std::string> & splits,
         struct llama_model_params params) {
     ggml_time_init();
 
@@ -9489,7 +9484,7 @@ struct llama_model * llama_model_load_from_file(
         LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
     }
 
-    const int status = llama_model_load(path_model, *model, params);
+    const int status = llama_model_load(path_model, splits, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {
@@ -9505,6 +9500,35 @@ struct llama_model * llama_model_load_from_file(
     return model;
 }
 
+// deprecated
+struct llama_model * llama_load_model_from_file(
+        const char * path_model,
+        struct llama_model_params params) {
+    return llama_model_load_from_file(path_model, params);
+}
+
+struct llama_model * llama_model_load_from_file(
+        const char * path_model,
+        struct llama_model_params params) {
+    std::vector<std::string> splits = {};
+    return llama_model_load_from_file_impl(path_model, splits, params);
+}
+
+struct llama_model * llama_model_load_from_splits(
+        const char ** paths,
+        size_t n_paths,
+        struct llama_model_params params) {
+    std::vector<std::string> splits;
+    if (n_paths == 0) {
+        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
+        return nullptr;
+    }
+    for (size_t i = 0; i < n_paths; ++i) {
+        splits.push_back(paths[i]);
+    }
+    return llama_model_load_from_file_impl(splits.front(), splits, params);
+}
+
 struct llama_context * llama_init_from_model(
         struct llama_model * model,
         struct llama_context_params params) {
