Commit f9b38b9

crasm, ggerganov, and cebtenzzre authored and committed
llama : add ability to cancel model loading (ggml-org#4462)
* llama : Add ability to cancel model load
  Updated llama_progress_callback so that if it returns false, the model loading is aborted.
* llama : Add test for model load cancellation
* Fix bool return in llama_model_load, remove std::ignore use
* Update llama.cpp
  Co-authored-by: Jared Van Bortel <[email protected]>
* Fail test if model file is missing
* Revert "Fail test if model file is missing"
  This reverts commit 32ebd52.
* Add test-model-load-cancel to Makefile
* Revert "Revert "Fail test if model file is missing""
  This reverts commit 2796953.
* Simplify .gitignore for tests, clang-tidy fixes
* Label all ctest tests
* ci : ctest uses -L main
* Attempt at writing ctest_with_model
* ci : get ci/run.sh working with test-model-load-cancel
* ci : restrict .github/workflows/build.yml ctest to -L main
* update requirements.txt
* Disable test-model-load-cancel in make
* Remove venv before creation
* Restructure requirements.txt
  Top-level now imports the specific additional requirements for each python file. Using `pip install -r requirements.txt` will fail if versions become mismatched in the per-file requirements.
* Make per-python-script requirements work alone
  This doesn't break the main requirements.txt.
* Add comment
* Add convert-persimmon-to-gguf.py to new requirements.txt scheme
* Add check-requirements.sh script and GitHub workflow
* Remove shellcheck installation step from workflow
* Add nocleanup special arg
* Fix merge
  see: ggml-org#4462 (comment)
* reset to upstream/master
* Redo changes for cancelling model load

---------

Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Jared Van Bortel <[email protected]>
1 parent 061e5f9 commit f9b38b9

File tree

2 files changed: +37 -15 lines changed


llama.cpp

Lines changed: 33 additions & 13 deletions
@@ -2372,7 +2372,8 @@ struct llama_model_loader {
             }
         }
 
-    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
+    // Returns false if cancelled by progress_callback
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
         size_t size_data = 0;
 
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2404,7 +2405,9 @@ struct llama_model_loader {
             GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
             if (progress_callback) {
-                progress_callback((float) size_done / size_data, progress_callback_user_data);
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
             }
 
             const size_t offs = file_offset(ggml_get_name(cur));
@@ -2466,8 +2469,11 @@ struct llama_model_loader {
         }
 
         if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
         }
+        return true;
     }
 };
 
@@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-static void llm_load_tensors(
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
@@ -3722,16 +3729,20 @@ static void llm_load_tensors(
         model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
     }
 
-    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+        return false;
+    }
 
     model.mapping = std::move(ml.mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
+    return true;
 }
 
-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
@@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
 
         if (params.vocab_only) {
             LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return true;
+            return 0;
         }
 
-        llm_load_tensors(
+        if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
-        );
+        )) {
+            return -2;
+        }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        return -1;
     }
 
-    return true;
+    return 0;
 }
 
 //
@@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file(
                     LLAMA_LOG_INFO("\n");
                 }
             }
+            return true;
         };
     }
 
-    if (!llama_model_load(path_model, *model, params)) {
-        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+    int status = llama_model_load(path_model, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+        }
         delete model;
         return nullptr;
     }
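
The plumbing above maps cancellation to an internal status of -2, while llama_load_model_from_file still reports both failure modes to callers as nullptr; only the log line distinguishes them. The following is a minimal caller-side sketch of triggering that path, assuming the public API from this commit. The 50% threshold and the "model.gguf" path are illustrative, not part of the change.

#include <cstdio>

#include "llama.h"

// Sketch of a cancelling callback: abort once loading passes 50% progress.
// Returning false makes the loader stop; llama_load_model_from_file then
// deletes the partially constructed model and returns nullptr.
static bool cancel_after_half(float progress, void * /*ctx*/) {
    return progress < 0.50f;
}

int main() {
    llama_backend_init(false); // takes a NUMA flag at the time of this commit

    llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback           = cancel_after_half;
    mparams.progress_callback_user_data = nullptr;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    // nullptr covers both a load error (-1) and a cancelled load (-2);
    // the log distinguishes "failed to load model" from "cancelled model load".
    std::printf("load %s\n", model != nullptr ? "completed" : "did not complete");

    if (model != nullptr) {
        llama_free_model(model);
    }
    llama_backend_free();
    return 0;
}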

llama.h

Lines changed: 4 additions & 2 deletions
@@ -127,7 +127,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef void (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void *ctx);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
@@ -180,7 +180,9 @@ extern "C" {
         int32_t main_gpu; // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
-        // called with a progress value between 0 and 1, pass NULL to disable
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+        // If the provided progress_callback returns true, model loading continues.
+        // If it returns false, model loading is immediately aborted.
         llama_progress_callback progress_callback;
 
        // context pointer passed to the progress callback
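
To make the header contract concrete, here is a hedged sketch of a callback that receives a user-controlled flag through progress_callback_user_data. The load_state struct and its field name are assumptions for illustration, not part of llama.h.

#include <atomic>
#include <cstdio>

#include "llama.h"

// Application-side state passed via the ctx pointer (assumed, not part of the API).
struct load_state {
    std::atomic<bool> cancel{false};
};

// Matches llama_progress_callback: return true to keep loading, false to abort.
static bool report_and_maybe_cancel(float progress, void * ctx) {
    auto * st = static_cast<load_state *>(ctx);
    std::fprintf(stderr, "\rloading: %3d%%", (int) (progress * 100));
    return !st->cancel.load();
}

// Usage sketch:
//   load_state st;
//   llama_model_params p = llama_model_default_params();
//   p.progress_callback           = report_and_maybe_cancel;
//   p.progress_callback_user_data = &st;
//   // another thread can request cancellation with: st.cancel = true;
//   llama_model * model = llama_load_model_from_file("model.gguf", p); // placeholder path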
